# NOTE(review): nothing in this file reads this module-level name; confirm no
# other module imports it before removing.
num = 0
import scrapy
from scrapy.http import HtmlResponse
from scrapy_demo.items import DoubanItem
"""
这个例子主要是学习meta传参。
"""
class DoubanSpider(scrapy.Spider):
    """Crawl the Douban Top 250 list and enrich each entry from its detail page.

    ``parse`` extracts title, score and summary from the list page, then
    requests the movie's detail page, handing the partially filled item to
    ``parse_detail`` through ``Request.meta``. ``parse_detail`` adds the
    description and yields the finished item.
    """

    name = "douban"
    allowed_domains = ["douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    # Upper bound on list entries followed per response, to keep the demo
    # crawl small. ``None`` disables the limit (``seq[:None]`` is the full list).
    MAX_ITEMS_PER_PAGE = 2

    # Browser-like headers sent with every detail-page request.
    DETAIL_HEADERS = {
        "Host": "movie.douban.com",
        "Referer": "https://movie.douban.com/top250",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    }

    @staticmethod
    def _clean_text(value):
        """Strip *value* and replace non-breaking spaces; pass ``None`` through.

        The page text contains ``&nbsp;``, which shows up as ``\\xa0`` in the
        extracted string, so it is normalised to a regular space.
        """
        if value is None:
            return None
        return value.strip().replace("\xa0", " ")

    def parse(self, response: HtmlResponse, **kwargs):
        """Parse the list page and schedule one detail request per movie.

        Yields:
            ``scrapy.Request`` for each movie's detail page, carrying the
            partially filled ``DoubanItem`` in ``meta["item"]``. If an entry
            has no detail link, the partial item is yielded directly instead.
        """
        item_list_xpath = '//div[@class="article"]/ol/li/div[@class="item"]'
        selector_list = response.xpath(item_list_xpath)

        # Only follow the first few entries; slicing replaces a manual counter.
        for selector in selector_list[: self.MAX_ITEMS_PER_PAGE]:
            # .get() returns None when the XPath matches nothing, so the text
            # is cleaned through a None-tolerant helper rather than chaining
            # .strip() directly on the result.
            title = self._clean_text(
                selector.xpath('./div[@class="info"]/div[@class="hd"]/a/span[@class="title"]/text()').get()
            )
            summary = self._clean_text(
                selector.xpath('./div[@class="info"]/div[@class="bd"]/p[not(@class="quote")]/text()').get()
            )
            score = selector.xpath(
                './div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').get()
            detail_url = selector.xpath('./div[@class="info"]/div[@class="hd"]/a/@href').get()

            doubanitem = DoubanItem()
            doubanitem["title"] = title
            doubanitem["score"] = score
            doubanitem["summary"] = summary

            if not detail_url:
                # Without a URL a Request cannot be built; keep what we have.
                self.logger.warning(
                    "No detail link for %r on %s; yielding item without description",
                    title, response.url,
                )
                yield doubanitem
                continue

            # Visit the detail page, passing the item along via meta.
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={"item": doubanitem},
                headers=self.DETAIL_HEADERS,
            )

    def parse_detail(self, response: HtmlResponse):
        """Add the description from the detail page to the item and yield it.

        The item is read from ``response.meta["item"]`` as set by ``parse``.
        """
        self.logger.debug("enter detail parse: %s", response.url)

        # Scrapy shallow-copies the meta dict passed to Request, so the value
        # stored under "item" is the very same object that parse() created;
        # a fresh DoubanItem is built per movie, so nothing is shared.
        doubanitem = response.meta.get("item")
        if doubanitem is None:
            self.logger.error("No item found in meta for %s; skipping", response.url)
            return

        desc = response.xpath('//span[@property="v:summary"]/text()').get()
        doubanitem["desc"] = desc
        yield doubanitem