批量获取www.kegg.jp的数据

发布时间: 2023-11-03 21:14:43 作者: byxiaobai

代码如下:

import requests
from bs4 import BeautifulSoup
import re

def visit2(url):
    """Download ``url`` and append the text of its first ``<pre>`` tag to ``ans.txt``.

    The file is opened in append mode (and created if missing), so repeated
    calls accumulate one record after another. Failures (network error,
    non-200 status, no ``<pre>`` tag) are reported on stdout and the function
    returns without writing anything.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        None.
    """
    try:
        # requests has no default timeout; without one a single stalled
        # connection would block the whole batch forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report and skip this page so the remaining downloads still run.
        print("请求失败,错误:", exc)
        return

    if response.status_code != 200:
        print("请求失败,状态码:", response.status_code)
        return

    soup = BeautifulSoup(response.text, "html.parser")
    pre_tag = soup.find("pre")
    if pre_tag is None:
        print("未找到<pre></pre>标签")
        return

    content = pre_tag.get_text()
    # Make sure every appended record ends with a newline so that the next
    # record does not get glued onto the last line of this one.
    if content and not content.endswith("\n"):
        content += "\n"

    with open("ans.txt", "a", encoding="utf-8") as file:
        file.write(content)

def visit1(url, content=None):
    """Fetch an entry page, locate its "AA seq" button and download its target.

    The page HTML is searched for a button whose ``onclick`` handler sets
    ``location.href``; that path is appended to ``https://www.kegg.jp`` and
    handed to ``visit2``. Failures (network error, non-200 status, button not
    found) are reported on stdout and the function returns.

    Args:
        url: Absolute URL of the entry page to fetch.
        content: Unused. Kept (now optional) only so existing two-argument
            calls keep working; the page HTML is always taken from the
            response.

    Returns:
        None.
    """
    # Announce the download before the blocking request so progress is
    # visible even when the server is slow.
    print("正在下载,url:", url)

    try:
        # requests has no default timeout; bound the wait so one stalled
        # connection cannot hang the whole batch.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report and skip this entry so the remaining links are still tried.
        print("请求失败,错误:", exc)
        return

    if response.status_code != 200:
        print("请求失败,状态码:", response.status_code)
        return

    # The target path lives inside an inline onclick handler, so it is pulled
    # from the raw HTML with a regex; no HTML parse is needed here.
    match = re.search(
        r"onclick=\"location.href='(.*?)';return false;\">AA seq</button>",
        response.text,
    )
    if match is None:
        print("未找到匹配的文本")
        return

    visit2("https://www.kegg.jp" + match.group(1))

def main():
    """Download the linked pages of KEGG entry K01068 via ``visit1``.

    Fetches the entry page, takes the fourth ``<td class="td41 defd">`` cell
    (presumably the gene list -- verify against the live page layout), and
    calls ``visit1`` for every link in that cell except the
    ``javascript:void(0)`` placeholders. Failures are reported on stdout.
    """
    url = "https://www.kegg.jp/entry/K01068"
    try:
        # requests has no default timeout; bound the wait so a stalled
        # connection cannot hang the script forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print("请求失败,错误:", exc)
        return

    if response.status_code != 200:
        print("请求失败,状态码:", response.status_code)
        return

    soup = BeautifulSoup(response.text, "html.parser")

    # All cells whose class attribute is exactly "td41 defd".
    td_elems = soup.find_all("td", class_="td41 defd")
    if len(td_elems) < 4:
        print("未找到足够的匹配元素")
        return

    # Index 3 is the fourth matching cell.
    fourth_td_elem = td_elems[3]
    # visit1 takes the cell HTML as its second positional argument, so it is
    # still passed along exactly as before.
    content = fourth_td_elem.prettify()

    # Read hrefs from the parsed tags rather than regex-matching serialized
    # HTML: attribute values come back entity-decoded, and only real <a>
    # elements are considered regardless of attribute order.
    for anchor in fourth_td_elem.find_all("a", href=True):
        link = anchor["href"]
        if "javascript:void(0)" in link:
            continue
        visit1("https://www.kegg.jp" + link, content)


if __name__ == '__main__':
    main()