|

楼主 |
发表于 2020-10-8 00:58:57
|
显示全部楼层
本帖最后由 Tinken 于 2020-10-8 01:06 编辑
今天没有太多时间,晚上花了点时间调整 Excel 写入,现在基本搞定;剩下的就是 IP 切换和异常处理,后面还需要使用 while 死循环,避免程序因异常而中断。
代码:- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
- # @ClassName test7
- # @Description TODO 爬当当网全部图书数据
- # @Author lanlo
- # @Date 2020-10-05 22:36
- # @Version 1.0
- import requests
- from bs4 import BeautifulSoup
- import redis
- # Excel操作库
- import xlwings as xw
- # 系统时间库
- import time
- import proxy_pool
- import re
- # 本地redis连接
- pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
- # 存储爬取的代理ip
- redis_conn = redis.Redis(connection_pool=pool, max_connections=10, db=0)
- # xlwings 写入Excel => books.xlsx
- def writeExcel(list):
- # 这将创建一个新的工作簿
- wb = xw.Book('books.xlsx')
- # 实例化工作表对象
- sht = wb.sheets['Sheet1']
- sht.api.Rows(1).Insert()
- sht.range('A1').value = list
- # 保存
- wb.save()
- def get_book():
- # 模拟浏览器的请求头
- headers = {
- 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/ 5.0.564.68'
- }
- proxy = proxy_pool.get_proxy_ip()
- if len(proxy) != 0:
- # 拼接url
- for i in range(27866721, 27866800):
- list_book = []
- url = "http://product.dangdang.com/{}.html".format(str(i))
- try:
- # 发起http请求,获取页面
- print("请求页面:{}".format(url))
- res = requests.get(url,
- headers=headers,
- proxies=proxy,
- timeout=5)
- except Exception:
- print("请求失败:{}".format(url))
- print("代理ip:{}".format(proxy))
- list_book.append("请求失败")
- pass
- res_code = res.status_code
- list_book.append(url)
- print("请求结果:{}".format(res_code))
- if res_code == 200:
- # 设置解析器
- soup = BeautifulSoup(res.text, "html.parser")
- # 获得数据内容
- print(soup.prettify())
- # 书名
- title = soup.find_all('h1')[0].text.replace("\n", '').replace("\t", "").replace(" ", "")
- list_book.append(title)
- print("书名:{}".format(title))
- # 作者
- author = soup.find_all("a", dd_name='作者')[0].string
- list_book.append(author)
- print("作者:{}".format(author))
- # 出版社
- press = soup.find_all("a", dd_name='出版社')[0].string
- list_book.append(press)
- print("出版社:{}".format(press))
- # 出版时间
- publication_time = soup.find_all("span", string=re.compile("出版时间"))[0].string.split(":")[1].replace("\n", '').replace("\t", "").replace(" ", "")
- list_book.append(publication_time)
- print("出版时间:{}".format(publication_time))
- for child in soup.find_all("ul", class_='key clearfix')[0]:
- book_info = str(child.string).replace("\n", '').replace("\t", "").replace(" ", "").replace("None", "")
- if book_info != "":
- print(book_info.split(":")[1])
- list_book.append(book_info.split(":")[1])
- # 分类
- book_class = soup.find_all("li", class_="clearfix fenlei")[0].text.split(":")[1]
- list_book.append(book_class)
- print("分类:{}".format(book_class))
- print(list_book)
- else:
- list_book.append("请求失败")
- # 写入Excel
- writeExcel(list_book)
- # 延时3秒
- time.sleep(3)
- print("get_book 结束")
- # python程序入口函数
- if __name__ == '__main__':
- # 星期 月 日 时:分:秒 年(Mon Oct 5 23:10:13 2020)
- print("程序开始时间:{}".format(time.ctime()))
- get_book()
复制代码 最后稍微处理一下,添加表头:
|
|