# python爬虫 (Python web crawler for shui5.cn tax-regulation articles)
#
# 发布时间 2023-07-18 10:49:50  作者: hacker_dvd
import requests
import re
import time
import hashlib
from pymysql.converters import escape_string
from mylib.module import *

def set_hash(string):
    md5 = hashlib.md5()
    md5.update(string.encode('utf-8'))
    return md5.hexdigest()

if __name__ == '__main__':
    # Crawl the first 10 listing pages of the Beijing tax-regulation section
    # of shui5.cn and insert each article's metadata and body into the
    # `shui5` MySQL table via the project DB helper.
    db = DB('127.0.0.1', 'root', 'sqsyq402', 'hello')
    base_url = 'https://www.shui5.cn/article/BeiJingShiCaiShuiFaGui/'
    for page in range(1, 11):
        # Listing page 1 is the bare directory URL; later pages use 152_<n>.html.
        p = '' if page == 1 else '152_{}.format'.format(page) if False else ('152_{}.html'.format(page))

        # timeout so one stalled connection cannot hang the whole crawl
        res = requests.get(url=base_url + p, timeout=10)
        res.encoding = 'utf-8'
        html = res.text
        a_ls = re.findall(r'<div class="xwt2_a">(.*?)</div>', html)
        for a in a_ls:
            hrefs = re.findall(r'href="(.*?)"', a)
            if not hrefs:
                continue  # malformed listing entry: no article link to follow
            href = hrefs[0]

            con_res = requests.get(url=href, timeout=10)
            con_res.encoding = 'utf-8'
            con_html = con_res.text

            try:
                # Each [0] assumes the element exists; guard the whole group so
                # one off-template article page is skipped instead of crashing
                # the entire run with an IndexError.
                title = re.findall(r'<h1>(.*?)</h1>', con_html)[0]
                source = re.findall(r"<span>来源:(.*?)</span>", con_html)[0]
                source_name = re.findall(r'target=_blank>(.*?)</a>', source)[0]
                author_name = re.findall(r'<span class="fa">作者:(.*?)</span>', con_html)[0]
                hot_url = re.findall(r'<span>人气:<script src="(.*?)"></script></script>', con_html)[0]
                hot_num = re.findall(r'\d+', requests.get(url=hot_url, timeout=10).text)[0]
                timer = re.findall(r'<span class="m_none">时间:(.*?)</span>', con_html)[0]
                des = re.findall(r'<div class="articleDes">摘要:(.*?)</div>', con_html)[0].strip()
                content = re.findall(r'<div class="arcContent" id="tupain">[\s\S]*?</div>', con_html)[0]
            except IndexError:
                print(f'页面结构不符, 跳过: {href}')
                continue

            title_hash = set_hash(title)  # hex digest — safe to interpolate as-is

            # Escape EVERY interpolated string field, not just des/content:
            # a quote in a title or author name would otherwise break the
            # statement (and is an injection vector).
            # NOTE(review): string-built SQL is still fragile; prefer
            # parameterized queries if the DB helper supports them.
            sql = f'''insert into shui5 values(NULL, "{escape_string(title)}", "{title_hash}",
            "{escape_string(source_name)}", "{escape_string(author_name)}", "{hot_num}",
            "{escape_string(timer)}", "{escape_string(des)}", "{escape_string(content)}", now())'''
            db.insert(sql)

        print(f'第{page}页爬完, 等待1s')
        time.sleep(1)  # be polite to the server between listing pages