2023数据采集与融合技术实践作业四-JZTXT

作业①

实验内容

要求

熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。
候选网站：东方财富网：http://quote.eastmoney.com/center/gridlist.html#hs_a_board

输出信息

MySQL数据库存储和输出格式如下，表头应使用英文命名，例如：序号：id，股票代码：bStockNo……，由同学们自行定义设计表头：

序号	股票代码	股票名称	最新报价	涨跌幅	涨跌额	成交量	振幅	最高	最低	今开	昨收
1	688093	N世华	28.47	10.92	26.13万	7.6亿	22.34	32.0	28.08	30.20	17.55

Gitee文件夹链接

代码

数据库部分

点击查看代码

# Create the SHARE table on first run.
conn = mysql.connector.connect(user='username', password='password', host='localhost', database='share')
try:
    cursor = conn.cursor()
    # CREATE TABLE IF NOT EXISTS does the existence check and the creation in
    # one statement, so no separate SHOW TABLES probe is needed.
    # Every identifier is backtick-quoted: CHANGE is a reserved word in MySQL
    # and is rejected as a bare column name.
    cursor.execute('''CREATE TABLE IF NOT EXISTS `SHARE`
           (`ID` INT AUTO_INCREMENT PRIMARY KEY,
            `CODE`           VARCHAR(255) NOT NULL,
            `NAME`           VARCHAR(255) NOT NULL,
            `PRICE`          VARCHAR(255) NOT NULL,
            `CHANGE`         VARCHAR(255) NOT NULL,
            `CHANGE_AMOUNT`  VARCHAR(255) NOT NULL,
            `VOLUME`         VARCHAR(255) NOT NULL,
            `AMOUNT`         VARCHAR(255) NOT NULL,
            `AMPLITUDE`      VARCHAR(255) NOT NULL,
            `HIGH`           VARCHAR(255) NOT NULL,
            `LOW`            VARCHAR(255) NOT NULL,
            `OPEN`           VARCHAR(255) NOT NULL,
            `CLOSE`          VARCHAR(255) NOT NULL)''')
finally:
    # Release the connection even if the DDL statement fails.
    conn.close()

def insert_data(code, name, price, change, change_amount, volume, amount, amplitude, high, low, open, close):
    """Insert one stock quote row into the SHARE table.

    Each argument is stored in the column of the same (upper-cased) name.
    A new connection is opened for the call and closed before returning,
    whether or not the insert succeeds.

    The values are sent as query parameters rather than being formatted into
    the SQL text, so a value containing a quote character cannot break or
    alter the statement. Column names are backtick-quoted because CHANGE is
    a reserved word in MySQL.

    Note: the ``open`` parameter shadows the builtin inside this function; the
    name is kept so existing keyword callers keep working.
    """
    sql = (
        "INSERT INTO `SHARE` (`CODE`, `NAME`, `PRICE`, `CHANGE`, `CHANGE_AMOUNT`, `VOLUME`, "
        "`AMOUNT`, `AMPLITUDE`, `HIGH`, `LOW`, `OPEN`, `CLOSE`) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    row = (code, name, price, change, change_amount, volume, amount, amplitude, high, low, open, close)

    conn = mysql.connector.connect(user='username', password='password', host='localhost', database='share')
    try:
        cursor = conn.cursor()
        cursor.execute(sql, row)
        conn.commit()
    finally:
        # Always release the connection, including when execute/commit raises.
        conn.close()

数据处理，抓取表格内容

def process_data(driver):
    """Store every row of the quote table on the currently loaded page.

    Locates the element with id ``table_wrapper-table``, walks the ``tr``
    rows of its ``tbody`` and passes the text of cells 1, 2 and 4-13 (in that
    order) to ``insert_data``. Cells 0 and 3 are not read.

    Args:
        driver: Selenium WebDriver positioned on the page holding the table.
    """
    # Cell positions, in the positional order insert_data expects.
    wanted_cells = (1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)

    table = driver.find_element(By.ID, "table_wrapper-table")
    tbody = table.find_element(By.TAG_NAME, "tbody")
    for row in tbody.find_elements(By.TAG_NAME, "tr"):
        # Fetch the cells once per row; each find_elements call is a
        # round trip to the browser.
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) <= wanted_cells[-1]:
            # The row does not have all the cells that are read below, so it
            # cannot be mapped onto the columns; skip it rather than raise
            # IndexError.
            continue
        insert_data(*(cells[index].text for index in wanted_cells))

抓取页面内容

# The three boards to crawl.
# NOTE(review): the URLs differ only in their #fragment — confirm that the
# page really reloads the board table on a fragment-only navigation.
urls = ["http://quote.eastmoney.com/center/gridlist.html#hs_a_board",
       "http://quote.eastmoney.com/center/gridlist.html#sh_a_board",
       "http://quote.eastmoney.com/center/gridlist.html#sz_a_board"
       ]
# Number of result pages scraped per board.
max_pages = 15
for url in urls:
    try:
        driver.get(url)
        print("爬取完成")
    except Exception as err:
        print(err)
        # The board did not load; scraping now would re-read whatever page is
        # still displayed and insert duplicate rows, so move on.
        continue

    for page in range(1, max_pages + 1):
        print("爬取第",page,"页")
        # NOTE(review): a fixed 0.2 s pause may be too short for the table to
        # refresh after a page turn; consider an explicit wait.
        time.sleep(0.2)
        process_data(driver)
        # Turn the page only when another page is still going to be scraped.
        if page < max_pages:
            next_page = driver.find_element(By.CLASS_NAME, "next")
            next_page.click()

结果

实验心得

这次实验实现了使用Selenium框架抓取网页动态内容以及翻页处理。通过实践，我对Selenium框架和MySQL有了更深的理解。同时我也明白了Selenium在操作页面元素之前需要等待网页加载完毕，否则会因元素尚未出现而报错。

作业②

实验内容

要求

熟练掌握 Selenium 查找HTML元素、实现用户模拟登录、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+MySQL爬取中国mooc网课程资源信息（课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介）
候选网站：中国mooc网：https://www.icourse163.org

输出信息

MYSQL数据库存储和输出格式

Id	cCourse	cCollege	cTeacher	cTeam	cCount	cProcess	cBrief
1	Python数据分析与展示	北京理工大学	嵩天	嵩天	470	2020年11月17日 ~ 2020年12月29日	“我们正步入一个数据或许比软件更重要的新时代。——Tim O'Reilly” ……

Gitee文件夹链接

代码

登录实现

# Log in through the login form, which is rendered inside an iframe.
username = "username"
password = "password"
url = "https://www.icourse163.org/member/login.htm"
try:
    driver.get(url)
except Exception as err:
    print(err)
time.sleep(1)

# Select the second tab of the login panel.
login_by_email = driver.find_element(By.XPATH, '//*[@id="login-cnt"]/div/div/div/div/div[1]/div/div[1]/div[2]/div[1]/ul/li[2]')
login_by_email.click()

# Switch into the iframe using the element itself. Going through its id
# attribute needs a second lookup and fails when the frame has no id.
login_frame = driver.find_element(By.TAG_NAME, 'iframe')
driver.switch_to.frame(login_frame)
try:
    time.sleep(3)
    input_username = driver.find_element(By.XPATH, '//*[@id="phoneipt"]')
    input_username.send_keys(username)
    # NOTE(review): the "auto-id-1698924755946" ids below look auto-generated
    # (the numeric suffix resembles a millisecond timestamp) and are probably
    # not stable across page loads — confirm and replace with a stable locator.
    input_password = driver.find_element(By.XPATH, '//*[@id="auto-id-1698924755946"]/div[3]/div[2]/input[1]')
    input_password.send_keys(password)
    time.sleep(1)
    login_button = driver.find_element(By.XPATH, '//*[@id="auto-id-1698924755946"]/div[3]/div[2]/a')
    login_button.click()
finally:
    # Leave the iframe even if one of the lookups above fails, so the driver
    # is not left scoped to the login frame.
    driver.switch_to.default_content()

信息抓取，内容为国家精品课

# Load the landing page and read the text of the course-card section.
url = "https://www.icourse163.org/"

try:
    driver.get(url)
    print("爬取完成")
except Exception as err:
    print(err)

data = driver.find_element(By.XPATH, "//*[@id='app']/div/div/div[8]/div[2]").text
# The section text is cut at every "0\n"; the piece before the first cut is
# discarded. Each remaining piece is flattened to one line and split on
# single spaces into a list of fields.
# NOTE(review): a field that itself contains a space is split into several
# entries — verify against the card layout.
class_list = [
    piece.replace("\n", " ").strip().split(" ")
    for piece in data.split("0\n")[1:]
]

# Open each course card in turn and append its schedule text and its
# introduction text to the matching class_list entry.
link = driver.find_element(By.XPATH, "//*[@id='app']/div/div/div[8]/div[2]").find_elements(By.CLASS_NAME, "commonCourseCardItem")
# Handle of the window holding the course list; every other window is a
# detail tab.
index_key = driver.window_handles[0]
for i, card in enumerate(link):
    # Close any detail tab left open by a previous iteration that failed.
    # The handle list is read fresh from the driver, so a window that is
    # already closed is never switched to.
    for handle in driver.window_handles:
        if handle != index_key:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(index_key)
    try:
        card.click()
        # Work in the most recently listed window after the click.
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(5)

        class_process = driver.find_element(By.XPATH, "//*[@id='course-enroll-info']/div/div[1]/div[2]/div/span[2]").text
        intro = driver.find_element(By.XPATH, "//*[@class='course-heading-intro_intro']").text
        print(class_process, intro)
        class_list[i].append(class_process)
        class_list[i].append(intro)
        driver.close()

        driver.switch_to.window(index_key)
        time.sleep(5)
    except Exception as err:
        # Best effort: report the failure and continue with the next card.
        print(err)

数据库部分

点击查看代码

# Create the CLASS table on first run.
conn = mysql.connector.connect(user='username', password='password', host='localhost', database='class')
try:
    cursor = conn.cursor()
    # CREATE TABLE IF NOT EXISTS does the existence check and the creation in
    # one statement, so no separate SHOW TABLES probe is needed.
    # cBrief is TEXT: a course introduction can exceed the 255-character
    # limit of the other columns.
    cursor.execute('''CREATE TABLE IF NOT EXISTS CLASS
           (ID INT AUTO_INCREMENT PRIMARY KEY,
            cCourse    VARCHAR(255) NOT NULL,
            cCollege   VARCHAR(255) NOT NULL,
            cTeacher   VARCHAR(255) NOT NULL,
            cTeam      VARCHAR(255) NOT NULL,
            cCount     VARCHAR(255) NOT NULL,
            cProcess   VARCHAR(255) NOT NULL,
            cBrief     TEXT NOT NULL)''')
finally:
    # Release the connection even if the DDL statement fails.
    conn.close()

def insert_data(cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief):
    """Insert one course row into the CLASS table.

    Each argument is stored in the column of the same name. A new connection
    is opened for the call and closed before returning, whether or not the
    insert succeeds.

    The values are sent as query parameters rather than being formatted into
    the SQL text, so text containing a quote character (for example an
    apostrophe in a course introduction) cannot break or alter the statement.
    """
    sql = (
        "INSERT INTO CLASS (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)"
    )
    row = (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)

    conn = mysql.connector.connect(user='username', password='password', host='localhost', database='class')
    try:
        cursor = conn.cursor()
        cursor.execute(sql, row)
        conn.commit()
    finally:
        # Always release the connection, including when execute/commit raises.
        conn.close()
# Write every scraped course record to the database.
for record in class_list:
    if len(record) < 7:
        # Pad short records: repeat field 4, then add a single-space
        # placeholder.
        record.extend([record[4], " "])
    course, college, teacher, count = record[0], record[1], record[2], record[3]
    # The teacher field is passed for both cTeacher and cTeam; field 4 is not
    # passed on.
    insert_data(course, college, teacher, teacher, count, record[5], record[6])