异步爬虫例子之asyncio

发布时间 2023-04-29 19:32:58作者: 不同凡响的太阳

异步爬虫例子:

import time

import aiohttp
import asyncio
import re
import os

os.environ['NO_PROXY'] = 'www.baidu.com'





class Asyn():
    """Async crawler for DrugBank indication search-result pages.

    Fetches result pages concurrently with aiohttp, extracts drug names and
    their ingredient links with regexes, prints each record, and appends the
    raw extracted lists to a local text file.
    """

    def __init__(self):
        # Browser-like request headers. The cookie value is a placeholder
        # and must be replaced with a valid session cookie before use.
        self.__headers = {
                            'authority': 'go.drugbank.com',
                            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
                            'cache-control': 'no-cache',
                            'cookie': 'xxxxx',
                            'referer': 'https://go.drugbank.com/unearth/q?query=*&button=&searcher=drugs',
                            'sec-ch-ua': '"Chromium";v="112", "Microsoft Edge";v="112", "Not:A-Brand";v="99"',
                            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
                        }

    async def __fetch(self, session, url):
        """Download one result page and extract (drug name, ingredients) pairs.

        Args:
            session: shared ``aiohttp.ClientSession``.
            url: result-page URL to fetch.

        Returns early (printing a notice) when the page has no matches,
        i.e. we requested a page past the last available one.
        """
        print("发送请求:", url)
        # NOTE(review): verify_ssl is deprecated in newer aiohttp in favor of
        # ssl=False; kept for compatibility with the version this was written for.
        async with session.get(url, verify_ssl=False, headers=self.__headers) as response:
            content = await response.text()
            # First column: the linked indication/drug names. re.findall on a
            # str with a fixed, valid pattern cannot raise, so the original
            # try/except around it was dead code and has been removed.
            names = re.findall(r'href="/indications/.*?">(.*?)</a', content)
            if not names:
                print("最大限度页")
                return
            # Second column: strip the anchor markup down to "id:display text"
            # and protect literal " / " separators so they survive later splits.
            raw_matches = re.findall(r'<div class="db-matches"><a (.*?)</a></div>', content)
            ingredients = []
            for raw in raw_matches:
                cleaned = (raw.replace('href="/drugs/', '')
                              .replace('">', ':')
                              .replace('</a>', '')
                              .replace('<a', '')
                              .replace(' / ', '【/】'))
                ingredients.append(cleaned)

            for yaoming, chenfen in zip(names, ingredients):
                dic = {
                    "药名": yaoming,
                    "成分": chenfen
                }
                print(dic)
            # NOTE(review): blocking file I/O inside a coroutine — fine for
            # small appends, but it stalls the event loop for large writes.
            with open('异步采集.txt', 'a', encoding='utf-8') as f:
                f.write(f'{len(names), names}\n{len(ingredients), ingredients}\n')

    async def main(self):
        """Read the page count from stdin and crawl all pages concurrently."""
        page = int(input("输入页数:"))
        async with aiohttp.ClientSession() as session:
            url_list = [f'https://go.drugbank.com/unearth/q?approved=1&ca=0&eu=0&page={i}&query=%2A&searcher=indications'
                        for i in range(1, page + 1)]
            # gather (unlike asyncio.wait) accepts an empty sequence — the
            # original raised ValueError when the user entered 0 pages — and
            # it propagates worker exceptions instead of silently collecting them.
            await asyncio.gather(*(self.__fetch(session, url) for url in url_list))


if __name__ == '__main__':
    # Entry point: build the crawler and drive its async main() to completion.
    crawler = Asyn()
    asyncio.run(crawler.main())

除了get请求,aiohttp还支持其它请求类型,如POST、PUT、DELETE等,和requests使用方式类似。

可获取:

async with aiohttp.ClientSession() as session:
    async with session.post('https://www.httpbin.org/post', data=data) as response:
        print('status:', response.status)  # 状态码
        print('headers:', response.headers)  # 响应头
        print('body:', await response.text())  # 响应体
        print('bytes:', await response.read())  # 响应体二进制内容
        print('json:', await response.json())  # 响应体json数据

可设置超时,例如下面的例子设置 1 秒的超时:

async def main():
    # Give every request a hard 1-second overall timeout.
    client_timeout = aiohttp.ClientTimeout(total=1)
    async with aiohttp.ClientSession(timeout=client_timeout) as session:
        page_urls = [
            f'https://go.drugbank.com/unearth/q?approved=1&ca=0&eu=0&page={i}&query=%2A&searcher=indications'
            for i in range(1, 200)
        ]
        # Schedule one fetch task per page, then wait for all of them.
        jobs = [asyncio.create_task(fetch(session, page_url)) for page_url in page_urls]
        await asyncio.wait(jobs)

更多参考:https://z.itpub.net/article/detail/602E65B824B2FC8A6AB5BDC2A1279822