scrapy 请求meta参数使用案例-豆瓣电影爬取

发布时间:2023-07-17 11:46:18 作者:蕝戀

# NOTE(review): this module-level `num` is never read in the visible code;
# DoubanSpider.parse() assigns and uses its own local `num`. It looks like a
# leftover from the page this file was copied from -- confirm before removing.
num = 0

import scrapy
from scrapy.http import HtmlResponse

from scrapy_demo.items import DoubanItem

"""
这个例子主要是学习meta传参。
"""

class DoubanSpider(scrapy.Spider):
    """Crawl the Douban Top 250 list and enrich each entry from its detail page.

    ``parse`` builds a partially filled ``DoubanItem`` from the list page and
    hands it to ``parse_detail`` through ``Request.meta``; ``parse_detail``
    adds the ``desc`` field and yields the finished item.
    """

    name = "douban"
    allowed_domains = ["douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    # Number of list entries followed per list page. Kept deliberately small
    # because this spider is a demo and should not crawl much data.
    MAX_ITEMS = 2

    @staticmethod
    def _clean_text(text):
        """Return ``text`` stripped, with non-breaking spaces turned into spaces.

        ``None`` (what ``.get()`` returns when an XPath matches nothing) is
        passed through unchanged instead of raising ``AttributeError``.
        """
        if text is None:
            return None
        # The page contains &nbsp;, which arrives as '\xa0' in the extracted text.
        return text.strip().replace("\xa0", " ")

    def parse(self, response: HtmlResponse, **kwargs):
        """Parse the list page and schedule one detail request per movie.

        Args:
            response: The downloaded Top 250 list page.
            **kwargs: Unused; accepted for compatibility with the base class.

        Yields:
            A ``scrapy.Request`` for each movie's detail page, carrying the
            partially filled item in ``meta["item"]``. If an entry has no
            detail link, the partial item is yielded directly with
            ``desc`` set to ``None``.
        """
        item_list_xpath = '//div[@class="article"]/ol/li/div[@class="item"]'

        # Slicing replaces the hand-maintained counter: only the first
        # MAX_ITEMS entries are processed.
        for selector in response.xpath(item_list_xpath)[: self.MAX_ITEMS]:
            doubanitem = DoubanItem()
            doubanitem["title"] = self._clean_text(
                selector.xpath(
                    './div[@class="info"]/div[@class="hd"]/a/span[@class="title"]/text()'
                ).get()
            )
            doubanitem["score"] = selector.xpath(
                './div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).get()
            doubanitem["summary"] = self._clean_text(
                selector.xpath(
                    './div[@class="info"]/div[@class="bd"]/p[not(@class="quote")]/text()'
                ).get()
            )

            detail_url = selector.xpath('./div[@class="info"]/div[@class="hd"]/a/@href').get()
            if not detail_url:
                # Without a link there is nothing to follow; emit what we have
                # rather than failing inside scrapy.Request.
                self.logger.warning(
                    "No detail link for %r on %s; yielding item without desc",
                    doubanitem["title"],
                    response.url,
                )
                doubanitem["desc"] = None
                yield doubanitem
                continue

            # Visit the detail page; the item travels with the request in meta.
            yield scrapy.Request(
                # urljoin leaves absolute URLs untouched and resolves relative ones.
                url=response.urljoin(detail_url),
                callback=self.parse_detail,
                meta={"item": doubanitem},
                headers={
                    "Host": "movie.douban.com",
                    "Referer": "https://movie.douban.com/top250",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
                }
            )

    def parse_detail(self, response: HtmlResponse):
        """Add the detail-page description to the item received via meta.

        Args:
            response: The downloaded movie detail page; ``response.meta["item"]``
                is expected to hold the item created in ``parse``.

        Yields:
            The item with its ``desc`` field set (``None`` when the page has
            no matching summary node).
        """
        self.logger.debug("enter detail parse")

        doubanitem = response.meta.get("item")
        if doubanitem is None:
            # Reached without going through parse(); nothing to enrich.
            self.logger.error("No item in meta for %s; skipping", response.url)
            return

        # parse() creates a fresh DoubanItem for every request, so writing to
        # it here cannot leak into items belonging to other requests.
        # NOTE(review): the original comment guessed that meta is deep-copied;
        # that is not established by this file -- check the Request.meta docs.
        doubanitem["desc"] = response.xpath('//span[@property="v:summary"]/text()').get()
        yield doubanitem