古诗文网名句爬取
Python古诗文网名句爬取
请求URL
https://so.gushiwen.cn/mingjus/default.aspx?astr={author}&page={page}
参数 astr 为作者名或朝代,参数 page 为页码。
例如,请求李白名句的第一页:
https://so.gushiwen.cn/mingjus/default.aspx?astr=李白&page=1
网页解析
# Parse the response body with Python's built-in HTML parser
soup = BeautifulSoup(response.text, 'html.parser')
# Locate the <div class="sons"> container
sons_div = soup.find('div', class_='sons')
# Collect every <div class="cont"> nested inside that container
cont_divs = sons_div.find_all('div', class_='cont')
文本获取
# Walk cont_divs and store the text of each <a> tag in the worksheet:
# one row per cont div, one column per link (both 1-based).
for row, cont_div in enumerate(cont_divs, start=1):
    for col, a_tag in enumerate(cont_div.find_all('a'), start=1):
        sheet.cell(row=row, column=col, value=a_tag.text)
        # To transpose the data, swap the row and column indices:
        # sheet.cell(row=col, column=row, value=a_tag.text)
完整代码
# Import required libraries
import os
import random
import re
import sys
import time
import tkinter as tk
from tkinter import messagebox

import openpyxl
import requests
from bs4 import BeautifulSoup

# Listing endpoint; `astr` is the author or dynasty, `page` is the page number.
BASE_URL = 'https://so.gushiwen.cn/mingjus/default.aspx'

# Seconds to wait for the site before giving up, so the GUI cannot hang forever.
REQUEST_TIMEOUT = 10


def _fetch_quote_rows(author, page):
    """Download one listing page and return the link texts of each quote.

    Args:
        author: Author or dynasty name, sent as the ``astr`` query parameter.
        page: Page number (as a string), sent as the ``page`` query parameter.

    Returns:
        A list with one entry per ``div.cont`` found inside the first
        ``div.sons``; each entry is the list of ``<a>`` texts in that div.
        Empty when the page has no ``div.sons`` container.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'X-Forwarded-For': '.'.join([str(random.randint(0, 255)) for _ in range(4)])
    }
    # Passing the values via `params` lets requests percent-encode them, so
    # characters such as '&' or '#' in the input cannot corrupt the query.
    response = requests.get(
        BASE_URL,
        params={'astr': author, 'page': page},
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    sons_div = soup.find('div', {'class': 'sons'})
    if sons_div is None:
        # Unexpected or empty page: report "no rows" instead of crashing.
        return []
    cont_divs = sons_div.find_all('div', {'class': 'cont'})
    return [[a_tag.text for a_tag in cont_div.find_all('a')]
            for cont_div in cont_divs]


def _safe_filename_part(text):
    """Replace characters that are not allowed in file names with '_'."""
    return re.sub(r'[\\/:*?"<>|]', '_', text)


# Search and download
def search():
    """Fetch the requested page of quotes and save it as an .xlsx file.

    Reads the author/dynasty and page number from the entry widgets, writes
    one worksheet row per quote (one column per link text), and saves the
    workbook in the current working directory. Problems are reported to the
    user through message boxes rather than raised, because exceptions raised
    inside a Tk callback are only printed to stderr.
    """
    author = author_entry.get().strip()
    page = page_entry.get().strip()

    if not page.isdecimal() or int(page) < 1:
        messagebox.showerror('错误', '页数必须是正整数')
        return
    page = str(int(page))

    try:
        rows = _fetch_quote_rows(author, page)
    except requests.RequestException as err:
        messagebox.showerror('错误', f'请求失败: {err}')
        return

    if not rows:
        messagebox.showinfo('提示', '没有找到名句')
        return

    # A fresh workbook per search, so rows from an earlier search never leak
    # into a later file.
    wb = openpyxl.Workbook()
    sheet = wb.active
    for row_index, texts in enumerate(rows, start=1):
        for col_index, text in enumerate(texts, start=1):
            sheet.cell(row=row_index, column=col_index, value=text)
            # To transpose the data, swap row_index and col_index.

    # Save the Excel file
    timestamp = str(int(time.time()))
    filename = ('古诗文网-' + _safe_filename_part(author) + '-页' + page
                + '-' + timestamp + '.xlsx')
    try:
        wb.save(filename)
    except OSError as err:
        messagebox.showerror('错误', f'保存失败: {err}')
        return
    messagebox.showinfo('完成', '已保存: ' + os.path.abspath(filename))


# by kuiwaiwai 20230322 1.0

# Build the window
root = tk.Tk()
root.title('古诗文网名句下载')
root.geometry('250x200')

# Labels
author_label = tk.Label(root, text='作者/朝代:')
author_label.grid(row=0, column=0, padx=10, pady=10)
page_label = tk.Label(root, text='页数:')
page_label.grid(row=1, column=0, padx=10, pady=10)

# Entry fields
author_entry = tk.Entry(root)
author_entry.grid(row=0, column=1, padx=10, pady=10)
page_entry = tk.Entry(root)
page_entry.grid(row=1, column=1, padx=10, pady=10)

# Button
search_button = tk.Button(root, text='搜索并下载', command=search)
search_button.grid(row=2, column=0, columnspan=2, padx=10, pady=10)

# Author credit
text_label = tk.Label(root, text='by kuiwaiwai 1.0版本')
text_label.grid(row=2, column=2, columnspan=2, padx=10, pady=10)

# Usage hint
text_label = tk.Label(root, text='例子:\n作者/朝代: 李白\n页数:1')
text_label.grid(row=3, column=0, columnspan=2, padx=10, pady=10)

if __name__ == '__main__':
    if getattr(sys, 'frozen', False):
        # In a frozen (PyInstaller) build, write output next to the
        # executable. The bundle directory sys._MEIPASS is a temporary
        # folder in one-file mode and is removed when the program exits,
        # which would delete the saved spreadsheets.
        os.chdir(os.path.dirname(os.path.abspath(sys.executable)))
    root.mainloop()
阅读剩余
版权声明:
作者:kuiwaiwai
链接:https://www.kuiwaiwai.com/blog-article/course/gushiwenwangmingjupython
文章版权归作者所有,未经允许请勿转载。
THE END