import requests
import re
import time
import hashlib
from pymysql.converters import escape_string
from mylib.module import *
def set_hash(string):
md5 = hashlib.md5()
md5.update(string.encode('utf-8'))
return md5.hexdigest()
# Listing pages of the crawled category; page 1 is the bare directory URL.
_LIST_URL = 'https://www.shui5.cn/article/BeiJingShiCaiShuiFaGui/'
# Seconds to wait for any single HTTP request, so a stalled server cannot
# hang the whole crawl.
_REQUEST_TIMEOUT = 10


def _fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    res = requests.get(url=url, timeout=_REQUEST_TIMEOUT)
    res.encoding = 'utf-8'
    return res.text


def _first_match(pattern, text):
    """Return the first ``re.findall`` result of *pattern* in *text*, or None."""
    matches = re.findall(pattern, text)
    return matches[0] if matches else None


def _scrape_article(href):
    """Fetch one article page and extract the fields stored in the database.

    Args:
        href: URL of the article page.

    Returns:
        A dict with keys ``title``, ``source_name``, ``author_name``,
        ``hot_num``, ``timer``, ``des`` and ``content`` (all raw, unescaped
        strings), or None when any expected field is missing from the page.

    Raises:
        requests.RequestException: If an HTTP request fails or times out.
    """
    con_html = _fetch_html(href)
    title = _first_match(r'<h1>(.*?)</h1>', con_html)
    source = _first_match(r"<span>来源:(.*?)</span>", con_html)
    # The source name is the link text nested inside the "source" span.
    source_name = None
    if source is not None:
        source_name = _first_match(r'target=_blank>(.*?)</a>', source)
    author_name = _first_match(r'<span class="fa">作者:(.*?)</span>', con_html)
    hot_url = _first_match(
        r'<span>人气:<script src="(.*?)"></script></script>', con_html)
    timer = _first_match(r'<span class="m_none">时间:(.*?)</span>', con_html)
    des = _first_match(r'<div class="articleDes">摘要:(.*?)</div>', con_html)
    content = _first_match(
        r'<div class="arcContent" id="tupain">[\s\S]*?</div>', con_html)

    page_fields = (title, source_name, author_name, hot_url, timer, des, content)
    if any(field is None for field in page_fields):
        return None

    # The view counter is served by a separate script URL; the first run of
    # digits in its response is taken as the count.
    hot_text = requests.get(url=hot_url, timeout=_REQUEST_TIMEOUT).text
    hot_num = _first_match(r'\d+', hot_text)
    if hot_num is None:
        return None

    return {
        'title': title,
        'source_name': source_name,
        'author_name': author_name,
        'hot_num': hot_num,
        'timer': timer,
        'des': des.strip(),
        'content': content,
    }


def _build_insert_sql(article):
    """Build the ``insert into shui5`` statement for one scraped article.

    Every scraped text value is passed through ``escape_string`` so quotes or
    backslashes in the page content cannot break out of the SQL string
    literals. The title hash is computed from the raw (unescaped) title.
    """
    title_hash = set_hash(article['title'])
    title = escape_string(article['title'])
    source_name = escape_string(article['source_name'])
    author_name = escape_string(article['author_name'])
    hot_num = escape_string(article['hot_num'])
    timer = escape_string(article['timer'])
    des = escape_string(article['des'])
    content = escape_string(article['content'])
    return f'''insert into shui5 values(NULL, "{title}", "{title_hash}", "{source_name}",
"{author_name}", "{hot_num}", "{timer}", "{des}", "{content}", now())'''


if __name__ == '__main__':
    # NOTE(review): database credentials are hard-coded here; consider loading
    # them from the environment or a config file.
    db = DB('127.0.0.1', 'root', 'sqsyq402', 'hello')
    for page in range(1, 11):
        p = '' if page == 1 else '152_{}.html'.format(page)
        url = _LIST_URL + p
        try:
            html = _fetch_html(url)
        except requests.RequestException as exc:
            print(f'page {page}: listing request failed ({exc}), skipping')
            time.sleep(1)
            continue

        for a in re.findall(r'<div class="xwt2_a">(.*?)</div>', html):
            href = _first_match(r'href="(.*?)"', a)
            if href is None:
                continue
            try:
                article = _scrape_article(href)
            except requests.RequestException as exc:
                print(f'{href}: request failed ({exc}), skipping')
                continue
            if article is None:
                # One malformed article must not abort the whole crawl.
                print(f'{href}: expected fields not found, skipping')
                continue
            db.insert(_build_insert_sql(article))

        print(f'第{page}页爬完, 等待1s')
        # Pause between listing pages to avoid hammering the site.
        time.sleep(1)
python爬虫
发布时间 2023-07-18 10:49:50 作者: hacker_dvd