作业内容
• 作业①:
o 要求:
▪ 熟练掌握 Selenium 查找 HTML 元素、爬取 Ajax 网页数据、等待 HTML 元素等内
容。
▪ 使用 Selenium 框架+ MySQL 数据库存储技术路线爬取“沪深 A 股”、“上证 A 股”、
“深证 A 股”3 个板块的股票数据信息。
o 候选网站:东方财富网:
http://quote.eastmoney.com/center/gridlist.html#hs_a_board
o 输出信息:MYSQL 数据库存储和输出格式如下,表头应是英文命名例如:序号
id,股票代码:bStockNo……,由同学们自行定义设计表头:
• Gitee 文件夹链接:作业1
代码如下
爬取股票
from selenium import webdriver
from selenium.webdriver.common.by import By
import sqlite3
from lxml import etree
from selenium.webdriver.edge.options import Options
import time

# Cells picked from each table row: 0-2 are id / stock code / stock name,
# 8-17 are latest price, change %, change amount, volume, turnover,
# amplitude, high, low, today's open and yesterday's close.
_COLUMN_INDEXES = (0, 1, 2, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)

# One shared column layout for all three board tables (was copy-pasted
# three times in the original).
_TABLE_COLUMNS = ("id int,"
                  "stocksymbol varchar(16),"
                  "stockname varchar(16),"
                  "LatestPrice varchar(16),"
                  "Pricelimit varchar(16),"
                  "Riseandfall varchar(16),"
                  "volume varchar(16),"
                  "turnover varchar(16),"
                  "amplitude varchar(16),"
                  "max varchar(16),"
                  "min varchar(16),"
                  "today varchar(16),"
                  "yesterday varchar(16)")

# (tab xpath to click, destination table name); None = board already shown.
_BOARDS = (
    (None, "hushenjing"),                           # 沪深A股 (default board)
    ('//*[@id="tab"]/ul/li[2]/a', "shangzhengagu"),  # 上证A股
    ('//*[@id="tab"]/ul/li[3]/a', "shenzhengagu"),   # 深证A股
)


def _parse_rows(page_source):
    """Return the text of every <td> for each row of the stock table."""
    tree = etree.HTML(page_source)
    return [tr.xpath('./td//text()')
            for tr in tree.xpath('//*[@id="table_wrapper-table"]/tbody/tr')]


def _store_rows(cursor, table, rows):
    """Create *table* (idempotently) and insert the selected cells of *rows*.

    IF NOT EXISTS makes the script re-runnable; the original plain
    CREATE TABLE raised OperationalError on a second run.
    """
    cursor.execute(f"create table if not exists {table}({_TABLE_COLUMNS})")
    placeholders = ",".join("?" * len(_COLUMN_INDEXES))
    for row in rows:
        cursor.execute(f"insert into {table} values({placeholders})",
                       tuple(row[i] for i in _COLUMN_INDEXES))


def main():
    """Scrape the three Eastmoney A-share boards into stock_data.db."""
    options = Options()
    options.add_argument("--headless")  # proper flag spelling ("headless" alone is ignored by some drivers)
    driver = webdriver.Edge(options=options)
    con = sqlite3.connect("stock_data.db")
    cursor = con.cursor()
    try:
        driver.get('http://quote.eastmoney.com/center/gridlist.html#hs_a_board')
        for tab_xpath, table in _BOARDS:
            if tab_xpath is not None:
                # Switch to the next board tab and give the Ajax table
                # time to re-render before reading page_source.
                driver.find_element(By.XPATH, tab_xpath).click()
                time.sleep(10)
            _store_rows(cursor, table, _parse_rows(driver.page_source))
            con.commit()
            time.sleep(10)
    finally:
        # Release DB and browser even if scraping fails part-way.
        cursor.close()
        con.close()
        driver.quit()


if __name__ == "__main__":
    main()
结果如下:

心得体会:
通过这次实验,加深了自己对selenium的理解,学会如何爬取ajax数据,如何存储数据,本题总体难度不大。
作业②:
o 要求:
▪ 熟练掌握 Selenium 查找 HTML 元素、实现用户模拟登录、爬取 Ajax 网页数据、
等待 HTML 元素等内容。
▪ 使用 Selenium 框架+MySQL 爬取中国 mooc 网课程资源信息(课程号、课程名
称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介)
o 候选网站:中国 mooc 网:https://www.icourse163.org
o 输出信息:MYSQL 数据库存储和输出格式
• Gitee 文件夹链接:作业2
代码如下:
爬取mooc
from selenium import webdriver
from selenium.webdriver.common.by import By
import sqlite3
from lxml import etree
import time
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

COURSES_PER_PAGE = 20  # the course list shows 20 cards per page


def _open_database(path="mooc.db"):
    """Open the SQLite database and ensure the course table exists.

    IF NOT EXISTS makes the script re-runnable (the original plain
    CREATE TABLE failed on a second run).
    """
    con = sqlite3.connect(path)
    cursor = con.cursor()
    cursor.execute("create table if not exists course("
                   "id int,"
                   "cCourse varchar(32),"
                   "cCollege varchar(32),"
                   "cTeacher varchar(16),"
                   "cCount varchar(100),"
                   "cProcess varchar(100),"
                   "cBrief varchar(100))")
    return con, cursor


def _login(driver, username, password):
    """Log in to icourse163: the form lives inside an iframe, so we must
    switch into it and back out afterwards or later lookups fail."""
    driver.get('https://www.icourse163.org/')
    time.sleep(3)
    driver.find_element(By.XPATH, '//*[@id="j-topnav"]/div').click()  # open login dialog
    time.sleep(3)
    iframe = driver.find_element(By.XPATH, '//*[@id="j-ursContainer-0"]/iframe')
    driver.switch_to.frame(iframe)
    driver.find_element(By.XPATH, '//*[@id="phoneipt"]').send_keys(username)
    time.sleep(2)
    driver.find_element(By.XPATH,
                        '//*[@id="login-form"]/div/div[4]/div[2]/input[2]').send_keys(password)
    time.sleep(2)
    driver.find_element(By.XPATH, '//*[@id="submitBtn"]').click()
    time.sleep(3)
    driver.switch_to.default_content()  # leave the iframe, back to main document


def _extract_brief(tree):
    """Return the course brief; it may sit under any of several nodes.

    Returns "" instead of raising IndexError when none of the candidate
    xpaths matches (the original's last branch indexed [0] unconditionally).
    """
    for xp in ('//*[@id="j-rectxt2"]/text()',
               '//*[@id="content-section"]/div[4]/div/p[1]/text()',
               '//*[@id="content-section"]/div[4]/div/p[1]/span/text()'):
        hits = tree.xpath(xp)
        if hits:
            return hits[0]
    return ""


def _shorten(text, limit=30):
    """Clip *text* to *limit* characters, marking the cut with an ellipsis.

    Short briefs are kept intact — the original sliced off their last
    character (brief[0:-1]) by mistake.
    """
    return text if len(text) <= limit else text[:limit] + '...'


def _parse_course(page_source):
    """Extract (course, college, teacher, count, process, brief) from a
    course-detail page's source."""
    tree = etree.HTML(page_source)
    course = tree.xpath('//*[@id="g-body"]/div[1]/div/div[3]/div/div[1]/div[1]/span[1]/text()')[0]
    process = tree.xpath('//*[@id="course-enroll-info"]/div/div[1]/div[2]/div/span[2]/text()')[0]
    count = tree.xpath('//*[@id="course-enroll-info"]/div/div[1]/div[4]/span[1]/text()')[0]
    teacher = tree.xpath('//*[@id="j-teacher"]/div/div/div[2]/div/div/div/div/div/h3/text()')[0]
    college = tree.xpath('//*[@id="j-teacher"]/div/a/img/@alt')[0]
    brief = _shorten(_extract_brief(tree))
    return course, college, teacher, count, process, brief


def main():
    """Crawl national-boutique courses on icourse163 into mooc.db."""
    username = input("请输入你的账号: ")
    password = input("请输入你的密码: ")
    stunum = int(input("你准备爬取的课程个数是: "))
    options = Options()
    options.add_argument("--headless")                                 # headless mode
    options.add_argument('--disable-gpu')                              # disable GPU
    options.add_argument('--disable-blink-features=AutomationControlled')  # hide webdriver traces
    options.use_chromium = True
    driver = webdriver.Edge(options=options)
    con, cursor = _open_database()
    try:
        _login(driver, username, password)
        # Enter the "国家精品" (national boutique) course channel.
        driver.find_element(By.XPATH,
                            '//*[@id="app"]/div/div/div[1]/div[1]/div[1]/span[1]/a').click()
        time.sleep(3)
        list_window = driver.window_handles[-1]  # handle of the course-list window
        slot = 1  # position of the course card on the current page (1..20)
        for idx in range(1, stunum + 1):
            driver.switch_to.window(list_window)
            driver.maximize_window()  # buttons may be unlocatable otherwise
            if slot > COURSES_PER_PAGE:
                # Ran off the current page: scroll down, click "next page",
                # scroll back up and restart the slot counter.
                driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
                nxt = driver.find_element(
                    By.XPATH, '//*[@id="channel-course-list"]/div/div/div[2]/div[2]/div/a[10]')
                driver.execute_script("arguments[0].click();", nxt)  # JS click avoids overlay issues
                time.sleep(2)
                driver.execute_script("window.scrollTo(0,0)")
                slot = 1
                driver.maximize_window()
                time.sleep(2)
            card = ('//*[@id="channel-course-list"]/div/div/div[2]/div[1]/div['
                    + str(slot) + ']/div/div[1]')
            # Wait until the card is clickable; clicking opens a new window.
            WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, card))).click()
            driver.switch_to.window(driver.window_handles[-1])  # jump to the course page
            time.sleep(2)
            course, college, teacher, count, process, brief = _parse_course(driver.page_source)
            print(str(idx) + " " + course + " " + college + " " + teacher
                  + " " + count + " " + process + " " + brief)
            cursor.execute("insert into course values(?, ?, ?, ?, ?, ?, ?)",
                           (idx, course, college, teacher, count, process, brief))
            con.commit()
            driver.close()  # close the course window, list window stays open
            time.sleep(2)
            slot += 1
    finally:
        # Release DB and browser even if the crawl fails part-way.
        cursor.close()
        con.close()
        driver.quit()


if __name__ == "__main__":
    main()
结果如下:

心得体会:
在刚开始处理登录时,需要掌握的是iframe的知识,定位到iframe内才能完成登录,登录成功后还得记得退出。
之后进入国家精品收集课程资料时,将课程一个个点进去获得页面源码解析。刚开始一直解析错误,xpath定位失败,后来发现要切换窗口句柄。与此同时也要进行页面最大化操作,否则可能会定位不到按钮。本题收获较大,了解到了之前不知道的知识点,更加明白了selenium功能的强大。
作业③:
o 要求:
• 掌握大数据相关服务,熟悉 Xshell 的使用
• 完成文档 华为云_大数据实时分析处理实验手册-Flume 日志采集实验(部
分)v2.docx 中的任务,即为下面 5 个任务,具体操作见文档。
• 环境搭建:
·任务一:开通 MapReduce 服务
• 实时分析开发实战:
·任务一:Python 脚本生成测试数据
·任务二:配置 Kafka
·任务三: 安装 Flume 客户端
·任务四:配置 Flume 采集数据
输出:实验关键步骤或结果截图。
一.开通MapReduce服务:

二.python脚本生成测试数据:



三.配置Kafka:

四.安装Flume客户端:
下载客户端

解压文件

校验文件包

解压“MRS_Flume_ClientConfig.tar”文件

安装Flume环境变量

解压到flume客户端

安装到flume客户端

重启flume服务

五.配置Flume采集数据:
修改配置文件

创建消费者消费kafka中的数据

