滚动刷新式网页的爬取方式

发布时间 2023-07-22 10:47:13 作者: hacker_dvd
from lxml import etree
import re
import requests
import os
import time
from fake_useragent import UserAgent


# Crawling a scroll-to-refresh ("infinite scroll") listing: instead of
# scrolling a page, request category_loading.jsp directly with an increasing
# `start` offset, then resolve and download every video linked from each page.
base_url = 'https://www.pearvideo.com/'
page_size = 24          # `start` advances by this many items per page
request_timeout = 30    # seconds; without a timeout a stalled connection blocks forever
download_delay = 100    # seconds to pause after each downloaded video
video_dir = './video'
chunk_size = 1024 * 1024  # bytes written per chunk while streaming a video

# open() does not create missing directories, so make sure the target exists.
os.makedirs(video_dir, exist_ok=True)

# Build the generator once; only `.random` needs to be evaluated per page.
user_agent = UserAgent()

for page in range(1, 4):
    new_page = (page - 1) * page_size
    url = f'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start={new_page}'
    headers = {'user-agent': user_agent.random}
    res = requests.get(url=url, headers=headers, timeout=request_timeout)
    # Fail loudly on an HTTP error instead of parsing an error page.
    res.raise_for_status()
    html = res.text
    e = etree.HTML(html)
    href = e.xpath('//a[@class="vervideo-lilink actplay"]/@href')
    for video_id in href:
        # The content id is whatever follows the last underscore of the href.
        cont_id = video_id.split('_')[-1]
        status_url = f'https://www.pearvideo.com/videoStatus.jsp?contId={cont_id}'
        # Add the referer on a per-video copy so the page-level headers are
        # never mutated (presumably the status endpoint checks it - confirm).
        video_headers = {**headers, 'referer': base_url + video_id}
        status_res = requests.get(url=status_url, headers=video_headers,
                                  timeout=request_timeout)
        status_res.raise_for_status()
        data = status_res.json()
        video_url = data['videoInfo']['videos']['srcUrl']
        # Swap the `systemTime` value inside srcUrl for 'cont-<id>'
        # (presumably the real file name uses the content id - confirm).
        video_url = video_url.replace(data['systemTime'], 'cont-' + cont_id)

        target_path = f'{video_dir}/{cont_id}.mp4'
        part_path = target_path + '.part'
        # Stream to disk so the whole video is never held in memory, and only
        # rename to the final name once the download has fully succeeded, so
        # an interrupted transfer cannot leave a truncated .mp4 behind.
        with requests.get(url=video_url, headers=video_headers, stream=True,
                          timeout=request_timeout) as video_res:
            video_res.raise_for_status()
            with open(part_path, 'wb') as f:
                for chunk in video_res.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        os.replace(part_path, target_path)
        print(f'{cont_id}下载完成')
        time.sleep(download_delay)