import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Fetch the links for every chapter on the novel's index page.
def get_novel_chapters(url_root="/2/2665/"):
    """Return a list of (chapter_url, chapter_title) tuples scraped from *url_root*.

    url_root: URL of the novel's chapter-index page.
        NOTE(review): the original default is a scheme-less path
        ("/2/2665/") — requests needs an absolute URL such as
        "http://<host>/2/2665/"; confirm the real host and pass it in.

    Chapter hrefs on the index page are relative, so each one is resolved
    against url_root with urljoin — the original stored the raw relative
    href, which get_chapter_content() could never fetch.
    """
    r = requests.get(url_root, timeout=10)
    r.encoding = 'gbk'  # the target site serves GBK-encoded pages
    soup = BeautifulSoup(r.text, 'html.parser')
    # Collected (url, title) pairs.
    data = []
    for dd in soup.find_all("dd"):
        link = dd.find("a")
        if not link:
            continue
        data.append((urljoin(url_root, link['href']), link.get_text()))
    return data
# Fetch one chapter page and return its cleaned body text.
def get_chapter_content(url):
    """Download *url* and return the text of its <div id="content">.

    Raises AttributeError if the page contains no div with id="content"
    (soup.find returns None).
    """
    r = requests.get(url, timeout=10)
    r.encoding = 'gbk'  # chapter pages are GBK-encoded like the index
    soup = BeautifulSoup(r.text, 'html.parser')  # parse the page
    # BUG FIX: the original called .replace("xa0*4", ...), replacing the
    # literal string "xa0*4" (which never occurs).  The intent is to turn
    # runs of four non-breaking spaces — the site's paragraph indent —
    # into a blank line, i.e. "\xa0" * 4.
    return soup.find("div", id="content").get_text().strip().replace("\xa0" * 4, "\n\n")
# Directory (named after the novel, 太古星辰诀) that chapter files go into.
path = '太古星辰诀'
if not os.path.exists(path):
    os.makedirs(path)

# Fetch the chapter list ONCE.  The original called get_novel_chapters()
# a second time in the for-loop header, issuing a redundant HTTP request
# for the whole index page.
novel_chapters = get_novel_chapters()
total_cnt = len(novel_chapters)

# enumerate replaces the original's manual idx counter; after the loop
# idx == total_cnt, matching the original's final state.
for idx, chapter in enumerate(novel_chapters, start=1):
    url, title = chapter
    print(chapter)
    print("下载中---------->", title)
    time.sleep(3)  # be polite to the server: pause 3 s between requests
    filename = path + '/' + '{}.txt'.format(title)
    # Save the chapter body as UTF-8 text, one file per chapter.
    with open(filename, "w", encoding='utf-8') as fout:
        fout.write(get_chapter_content(url))