数据采集与融合技术实践作业3

发布时间 2023-10-19 21:09:48作者: leo102102119

作业①:

要求:

指定一个网站,爬取这个网站中的所有图片,例如:中国气象网(http://www.weather.com.cn)。使用scrapy框架分别实现单线程和多线程的方式爬取。
–务必控制总页数(学号尾数2位)、总下载的图片数量(尾数后3位)等限制爬取的措施。

输出信息:

将下载的Url信息在控制台输出,并将下载的图片存储在images子文件中,并给出截图。

Gitee文件夹链接

https://gitee.com/li-zimusuidao/data-fusion

代码:

from lxml import etree
import requests
import os
if __name__ == '__main__':
    # Download up to 119 images from the VCG creative-image gallery into ./102102119.
    s = 0  # number of images successfully written so far
    url = 'https://www.vcg.com/creative-image'
    header = {
        # Bug fix: key was 'User_Agent' (underscore), which is not a valid
        # HTTP header name, so no real User-Agent was sent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36 Edg/102.0.1245.39'
    }
    response = requests.get(url=url, headers=header)
    response.encoding = 'utf-8'
    tree = etree.HTML(response.text)
    # Each gallery image sits in a <figure>; the real URL is in the lazy-load attr.
    figure_list = tree.xpath('//div[@class="gallery_inner"]/figure')
    os.makedirs('./102102119', exist_ok=True)
    for figure in figure_list:
        try:
            img_src = figure.xpath('./a/img/@data-src')[0]
        except IndexError:
            print('未成功匹配到字段')
            # Bug fix: previously fell through and reused an undefined/stale img_src.
            continue
        img_src = 'https:' + img_src
        img_name = img_src.split('/')[-1]
        try:
            # Broadened from InvalidURL so timeouts/connection errors are also
            # skipped instead of crashing with an undefined img_data below.
            img_data = requests.get(url=img_src, headers=header).content
        except requests.exceptions.RequestException:
            print('没有访问地址')
            continue
        img_path = os.path.join('102102119', img_name)
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功')
        s += 1
        if s >= 119:  # cap: student-id suffix (119) images total
            break

结果:

心得体会:

本次任务完成了对一个大型网站所有图片的下载。通过这个任务,我学习了lxml的运用和字段的匹配。作业其实不是特别难,但是寻找图片是最困难的一步。我试了很多种方法,终于成功了。

作业②

要求:

熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取股票相关信息。
候选网站:东方财富网:https://www.eastmoney.com/

输出信息:

MySQL数据库存储和输出格式如下:
表头英文命名例如:序号id,股票代码:bStockNo……,由同学们自行定义设计

序号 股票代码 股票名称 最新报价 涨跌幅 涨跌额 成交量 振幅 最高 最低 今开 昨收
1 688093 N世华 28.47 10.92 26.13万 7.6亿 22.34 32.0 28.08 30.20 17.55
2 ……

Gitee文件夹链接

https://gitee.com/li-zimusuidao/data-fusion

代码:

# --- Scrapy settings for the Splash-rendered stock crawl ---

# Do not honour robots.txt for this crawl.
ROBOTSTXT_OBEY = False

# scrapy_splash middlewares: cookie passthrough and request rewriting for the
# headless renderer, plus standard HTTP compression last.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Every scraped item goes through the SQL Server pipeline.
ITEM_PIPELINES = {
    'getStock.pipelines.GetstockPipeline': 300,
}

# On-disk HTTP cache (0 = entries never expire), using the Splash-aware backend.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# Local Splash endpoint and the dedup filter that understands Splash requests.
SPLASH_URL = "http://192.168.5.185:8050/"
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

# Retry server errors as well as common client-side failure codes.
RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 403, 404, 408]
import scrapy
class GetstockItem(scrapy.Item):
    """One scraped stock record; field names map to the Chinese columns
    inserted by GetstockPipeline (see dbo.stock_data)."""
    name = scrapy.Field()     # stock name (股票名称)
    code = scrapy.Field()     # stock code (股票代码)
    nprice = scrapy.Field()   # current price (现时价格)
    op = scrapy.Field()       # opening price (开盘价格)
    ed = scrapy.Field()       # previous close (昨收价格)
    high = scrapy.Field()     # daily high (最高价格)
    low = scrapy.Field()      # daily low (最低价格)
    volume = scrapy.Field()   # total turnover (总交易额), parsed with '亿' stripped
    date = scrapy.Field()     # quote date (日期)
    hangye = scrapy.Field()   # industry sector (所属行业)
    suggest = scrapy.Field()  # preliminary recommendation (初步建议), may be absent
import pyodbc
import requests
class GetstockPipeline(object):
    """Persist GetstockItem records into SQL Server (dbo.stock_data) via pyodbc.

    Uses parameterized queries: the original built SQL with %-formatting, which
    both quoted numeric columns as strings ('%.2f' in quotes) and was open to
    SQL injection through scraped text fields.
    """

    def __init__(self):
        # NOTE(review): placeholder DSN values (服务器名/用户名/密码/数据库名)
        # must be filled in before this pipeline can actually connect.
        self.conn = pyodbc.connect("DRIVER={SQL SERVER};SERVER=服务器名;UID=用户名;PWD=密码;DATABASE=数据库名")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the stock row, then attach the recommendation (or a default)."""
        try:
            sql = ("INSERT INTO dbo.stock_data"
                   "(股票代码,股票名称,现时价格,昨收价格,开盘价格,最高价格,最低价格,总交易额,所属行业,日期)"
                   " VALUES(?,?,?,?,?,?,?,?,?,?)")
            self.cursor.execute(sql, (item['code'], item['name'], item['nprice'],
                                      item['ed'], item['op'], item['high'],
                                      item['low'], item['volume'],
                                      item['hangye'], item['date']))
            try:
                # item['suggest'] may be missing (spider swallows extraction
                # failures), in which case we fall through to the default text.
                self.cursor.execute(
                    "update dbo.stock_data set 初步建议=? where dbo.stock_data.股票代码=?",
                    (item['suggest'], item['code']))
                print('success')
            except Exception:
                self.cursor.execute(
                    "update dbo.stock_data set 初步建议='该股票暂无初步建议' where dbo.stock_data.股票代码=?",
                    (item['code'],))
                print("该股票暂无初步建议")
            self.conn.commit()
            print('信息写入成功')
        except Exception as ex:
            # Best-effort: log and keep the item flowing through the pipeline.
            print(ex)
        return item
from scrapy import Spider, Request
#from scrapy_splash import SplashRequest
#import SplashRequest
import scrapy
import re
#from getStock.items import GetstockItem
#import GetstockItem
# Default request headers for the stock spider.
headers = {
    # Bug fix: the value previously began with a literal "User-Agent:" prefix
    # ('User-Agent:Opera/9.80 ...'), so the UA string sent to servers was malformed.
    'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'
}
class StockSpider(scrapy.Spider):
    """Eastmoney stock spider: list page -> per-stock Sohu quote page -> GetstockItem.

    NOTE(review): both ``yield`` request lines below are commented out, so as
    written this spider issues no requests and the callbacks are never invoked —
    presumably because the scrapy_splash imports at the top of the file were
    disabled. Confirm and re-enable before running.
    """
    name = 'stock'
    start_urls = ['http://quote.eastmoney.com']

    def start_requests(self):
        # Entry point: the full stock list page (request currently disabled, see class note).
        url = 'http://quote.eastmoney.com/stock_list.html'
        #yield request(url, self.parse, headers=headers)
        #yield SplashRequest(url, self.parse, headers=headers)

    def zymk(self, response):
        # Pull every 6-digit stock code out of the page's links and build a
        # Sohu quote URL for each; links without a code are skipped.
        for href in response.css('a::attr(href)').extract():
            try:
                stock = re.findall(r"\d{6}", href)[0]
                url = 'http://q.stock.sohu.com/cn/' + stock
                #yield request(url, self.zymk1, args={'wait': 5}, headers=headers)
                #yield SplashRequest(url, self.zymk1,args={'wait':5}, headers=headers )
            except:
                continue

    def zymk1(self, response):
        # Parse one Sohu quote page into a GetstockItem. Any extraction failure
        # is swallowed and a possibly partly-filled item is still yielded —
        # downstream (GetstockPipeline) tolerates a missing 'suggest'.
        item = GetstockItem()
        try:
            stockinfo = response.xpath('// *[ @ id = "contentA"] / div[2] / div / div[1]')
            item['name'] = stockinfo.xpath('//*[@class="name"]/a/text()').extract()[0]
            item['code'] = stockinfo.xpath('//*[@class="code"]/text()').extract()[0].replace('(','').replace(')','')
            item['date'] = stockinfo.xpath('//*[@class="date"]/text()').extract()[0]
            item['nprice'] = float(stockinfo.xpath('//li[starts-with(@class,"e1 ")]/text()').extract()[0])
            # Price table cells: rows/columns below match the FT_priceA2 layout
            # observed on the quote page — TODO confirm against a live page.
            item['high'] = float(response.xpath('//*[@id="FT_priceA2"]/tbody/tr[1]/td[5]/span/text()').extract()[0])
            item['low'] = float(response.xpath('//*[@id="FT_priceA2"]/tbody/tr[2]/td[5]/span/text()').extract()[0])
            item['ed'] = float(response.xpath('//*[@id="FT_priceA2"]/tbody/tr[1]/td[7]/span/text()').extract()[0])
            item['op'] = float(response.xpath('//*[@id="FT_priceA2"]/tbody/tr[2]/td[7]/span/text()').extract()[0])
            # Turnover is reported in units of 亿; the suffix is stripped here.
            item['volume'] = float(response.xpath('//*[@id="FT_priceA2"]/tbody/tr[2]/td[3]/span/text()').extract()[0].replace('亿',''))
            item['hangye'] = response.xpath('//*[@id="FT_sector"] / div / ul / li[1]/a/text()').extract()[0]
            suggests = response.xpath('//*[@id="contentA"]/div[2]/div/div[3]/div[2]/div[2]/div[1]/div[2]/table/tbody/tr[1]')
            item['suggest'] = suggests.xpath('//td[starts-with(@class,"td1 ")]/span/text()').extract()[0]
        except:
            pass
        yield item

结果:

心得体会:

本次实验我使用了scrapy 中Item、Pipeline 数据的序列化输出方法,难点在于如何调用服务器端口。主要的解决方法是:一是查找服务器端口,二是使用函数找到标签。

作业③:

要求:

熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。
候选网站:中国银行网:https://www.boc.cn/sourcedb/whpj/

输出信息:

Currency TBP CBP TSP CSP Time
阿联酋迪拉姆 198.58 192.31 199.98 206.59 11:27:14

Gitee文件夹链接

https://gitee.com/li-zimusuidao/data-fusion

代码:

import scrapy
from bs4 import UnicodeDammit
#from ..items import WaihuiItem
class spider_Waihui(scrapy.Spider):
    """Scrape the CMB real-time FX quote table and yield one Zymk item per row."""
    name = "spiderWaihui"
    start_urls=["http://fx.cmbchina.com/hq/"]

    # (item field, td position) for the four numeric price columns of each row.
    _PRICE_COLUMNS = (("tsp", 4), ("csp", 5), ("tbp", 6), ("cbp", 7))

    def parse(self, response):
        try:
            # Sniff the encoding (utf-8 or gbk) before building a selector.
            dammit = UnicodeDammit(response.body, ["utf-8", 'gbk'])
            selector = scrapy.Selector(text=dammit.unicode_markup)
            rows = selector.xpath("//div[@id='realRateInfo']/table/tr")
            for row in rows[1:]:  # rows[0] is the header row
                item = Zymk()
                name_cell = row.xpath("./td[position()=1][@class='fontbold']/text()").extract_first()
                item["type"] = str(name_cell).strip()
                for field, pos in self._PRICE_COLUMNS:
                    cell = row.xpath(f"./td[position()={pos}][@class='numberright']/text()").extract_first()
                    item[field] = str(cell).strip()
                time_cell = row.xpath("./td[position()=8][@align='center']/text()").extract_first()
                item["time"] = str(time_cell).strip()
                yield item
        except Exception as err:
            print(err)
import scrapy
class Zymk(scrapy.Item):
    """One FX quote row scraped by spider_Waihui; columns map to the CMB
    real-time rate table (td positions 1, 4-8)."""
    type = scrapy.Field()  # currency name (first table cell)
    tsp = scrapy.Field()   # price from td 4 — presumably transfer selling price; verify against site
    csp = scrapy.Field()   # price from td 5 — presumably cash selling price; verify against site
    tbp = scrapy.Field()   # price from td 6 — presumably transfer buying price; verify against site
    cbp = scrapy.Field()   # price from td 7 — presumably cash buying price; verify against site
    time = scrapy.Field()  # quote time (td 8)
import pymysql
class WaihuiPipeline:
    """Persist FX items from spider_Waihui into the MySQL table ``waihui``."""

    def open_spider(self, spider):
        """Connect to MySQL and clear the target table.

        Bug fix: ``self.count`` (and ``self.opened``) are now initialized before
        the ``try`` — previously a failed connection left ``count`` unset and
        ``close_spider`` crashed with AttributeError when printing it.
        """
        print("opened")
        self.opened = False
        self.count = 0
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="hadoop", db="mydb", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # Start each crawl from an empty table.
            self.cursor.execute("delete from waihui")
            self.opened = True
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        """Commit once at the end of the crawl and report the row count."""
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "条")

    def process_item(self, item, spider):
        """Insert one quote row; errors are logged and the item passed through."""
        try:
            if self.opened:
                self.cursor.execute(
                    "insert into waihui(btype,btsp,bcsp,btbp,bcbp,btime) values (%s,%s,%s,%s,%s,%s)",
                    (item["type"], item["tsp"], item["csp"], item["tbp"], item["cbp"], item["time"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item

结果:

心得体会:

本次实验我使用了scrapy 中Item、Pipeline 数据的序列化输出方法,难点在于如何调用服务器端口。主要的解决方法是:一是查找服务器端口,二是定义1个标签类并寻找到标签。