古诗文网名句爬取

Python古诗文网名句爬取

请求URL

https://so.gushiwen.cn/mingjus/default.aspx?astr={author}&page={page}

参数 astr 为作者名或朝代,参数 page 为页码(即要请求第几页)

例如请求第一页李白的名句

https://so.gushiwen.cn/mingjus/default.aspx?astr=李白&page=1

网页解析

# Parse the response body with the standard-library 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')
sons_div = soup.find('div', {'class': 'sons'})  # first <div> whose class is "sons"
cont_divs = sons_div.find_all('div', {'class': 'cont'})  # every <div> with class "cont" inside it

文本获取

# Walk cont_divs and store the text of every <a> tag in the worksheet:
# one row per cont_div, one column per <a> tag inside it.
for i, cont_div in enumerate(cont_divs):
    a_tags = cont_div.find_all('a')
    for j, a_tag in enumerate(a_tags):
        sheet.cell(column=j+1, row=i+1, value=a_tag.text)
        # To transpose the output, swap row and column instead:
        # sheet.cell(row=j + 1, column=i + 1, value=a_tag.text)

完整代码

# 导入所需库
import requests
from bs4 import BeautifulSoup
import openpyxl
import random
import tkinter as tk
import time
import sys
import os

# Create an Excel workbook and take its active worksheet.
wb = openpyxl.Workbook()
sheet = wb.active

# 搜索信息
def search():
    """Download one page of famous lines and save it as an .xlsx file.

    Reads the author/dynasty and the page number from the ``author_entry``
    and ``page_entry`` widgets, requests the matching listing page from
    so.gushiwen.cn, and writes the text of every ``<a>`` tag found in each
    ``div.cont`` (inside the first ``div.sons``) to a spreadsheet: one row
    per ``div.cont``, one column per ``<a>`` tag.

    The file is saved in the current working directory as
    ``古诗文网-<author>-<page>-<unix timestamp>.xlsx``.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: the returned page has no ``div.sons`` container.
        Exception: with the message '保存失败' when the workbook cannot be
            written; the underlying error is chained as ``__cause__``.
    """
    # Surrounding whitespace would end up in both the query and the filename.
    author = author_entry.get().strip()
    page = page_entry.get().strip()

    # Let requests build the query string so characters such as '&' or '#'
    # in the user's input are percent-encoded instead of corrupting the URL.
    url = 'https://so.gushiwen.cn/mingjus/default.aspx'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'X-Forwarded-For': '.'.join([str(random.randint(0, 255)) for _ in range(4)])
    }
    # This runs on the Tk main loop; without a timeout a stalled connection
    # would freeze the window indefinitely.
    response = requests.get(
        url,
        params={'astr': author, 'page': page},
        headers=headers,
        timeout=10,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    sons_div = soup.find('div', {'class': 'sons'})
    if sons_div is None:
        # find() returns None when nothing matches; fail with a clear message
        # rather than an AttributeError on the next line.
        raise ValueError('未找到名句列表,请检查作者/朝代和页数')
    cont_divs = sons_div.find_all('div', {'class': 'cont'})

    # Use a fresh workbook per call so rows left over from an earlier,
    # longer result never leak into this file.
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    for row, cont_div in enumerate(cont_divs, start=1):
        for column, a_tag in enumerate(cont_div.find_all('a'), start=1):
            # Swap row and column here to transpose the output.
            worksheet.cell(row=row, column=column, value=a_tag.text)

    timestamp = str(int(time.time()))
    filename = f'古诗文网-{author}-{page}-{timestamp}.xlsx'
    try:
        workbook.save(filename)
    except Exception as err:
        # Keep the original exception type and message, but chain the cause
        # so the real reason (permissions, invalid filename, ...) is visible.
        raise Exception('保存失败') from err

# by kuiwaiwai 20230322 1.0

# Main application window.
root = tk.Tk()
root.title('古诗文网名句下载')
root.geometry('250x200')

# Outer padding shared by every widget placed on the grid.
_GRID_PAD = {'padx': 10, 'pady': 10}

# Prompt labels, stacked in the first column.
author_label = tk.Label(master=root, text='作者/朝代:')
author_label.grid(column=0, row=0, **_GRID_PAD)
page_label = tk.Label(master=root, text='页数:')
page_label.grid(column=0, row=1, **_GRID_PAD)

# Text inputs, each placed to the right of its prompt.
author_entry = tk.Entry(master=root)
author_entry.grid(column=1, row=0, **_GRID_PAD)
page_entry = tk.Entry(master=root)
page_entry.grid(column=1, row=1, **_GRID_PAD)

# Button that triggers the download; it spans both columns.
search_button = tk.Button(master=root, command=search, text='搜索并下载')
search_button.grid(column=0, row=2, columnspan=2, **_GRID_PAD)

# Author credit, placed to the right of the button.
text_label = tk.Label(master=root, text='by kuiwaiwai 1.0版本')
text_label.grid(column=2, row=2, columnspan=2, **_GRID_PAD)

# Usage example shown below the button (rebinds text_label, as before).
text_label = tk.Label(master=root, text='例子:\n作者/朝代: 李白\n页数:1')
text_label.grid(column=0, row=3, columnspan=2, **_GRID_PAD)

if __name__ == '__main__':
    if getattr(sys, 'frozen', False):
        # Running from a frozen (PyInstaller) bundle: save results next to the
        # executable. The previous target, sys._MEIPASS, is the bundle's
        # extraction directory; in one-file builds that is a temporary folder
        # removed on exit, so spreadsheets written there were lost.
        os.chdir(os.path.dirname(os.path.abspath(sys.executable)))
    # Hand control to Tk; this returns only when the window is closed.
    root.mainloop()
阅读剩余
THE END