# -*- coding: utf-8 -*-
import urllib.error
import lxml
from lxml import etree
from urllib.parse import urljoin
from html import unescape
import re
import math
import requests
from pyquery import PyQuery as pq
# Compute the similarity of two strings (LCS-length ratio, not cosine similarity)
def strsim(x: str, y: str):
    """Similarity of two strings in [0, 1]: LCS length divided by the
    longer length.

    Returns 0 for empty input, or when the lengths differ by more than a
    factor of 3 (such strings are assumed unrelated).
    """
    xlen, ylen = len(x), len(y)
    if not xlen or not ylen:
        return 0
    if xlen > ylen:
        ratio = xlen / ylen
    else:
        ratio = ylen / xlen
    if ratio > 3:
        return 0
    return lcs(x, y) / max(xlen, ylen)


def lcs(x: str, y: str):
    """Return the length of the longest common subsequence of *x* and *y*.

    Classic DP recurrence, but only two rows of the table are kept,
    cutting memory from O(len(x)*len(y)) to O(len(y)) while producing the
    same result as the full-table version.
    """
    xlen, ylen = len(x), len(y)
    if not xlen or not ylen:
        return 0
    below = [0] * (ylen + 1)  # DP row i + 1
    for i in range(xlen - 1, -1, -1):
        row = [0] * (ylen + 1)  # DP row i, being filled
        for j in range(ylen - 1, -1, -1):
            if x[i] == y[j]:
                row[j] = below[j + 1] + 1
            else:
                row[j] = max(below[j], row[j + 1])
        below = row
    return below[0]
class CountInfo(object):
    """Per-node text/tag statistics used to score content density."""
    def __init__(self):
        # Total characters of (stripped) text under the node.
        self.text_count = 0
        # Characters of text that live inside <a> tags.
        self.link_text_count = 0
        # Number of element tags under the node, including itself.
        self.tag_count = 0
        # Number of <a> tags under the node.
        self.link_tag_count = 0
        # Non-link text length per non-link tag for this node.
        self.density = 0.0
        # Sum of the direct children's densities.
        self.density_sum = 0.0
        self.score = 0.0
        # Number of <p> tags under the node.
        self.pcount = 0
        # Length of every text leaf under the node.
        self.leaflist = []
class ContentExtractor:
    """Extract the main article content from a news-style HTML page.

    Every DOM node is scored by text density (text length vs. tag count,
    discounting link text); the highest-scoring node becomes the content
    root. Typical usage::

        ex = ContentExtractor()
        if ex.extract(url, html_source):
            print(ex.title, ex.clean_text)
    """

    def __init__(self):
        self.doc = None           # parsed lxml document tree
        self._infomap = {}        # node -> CountInfo for the whole page
        self._blockinfo = {}      # node -> CountInfo inside the top node
        self.url = None
        # Block-level tags that start a new output line when formatting.
        self.newline = ["div", "li", "p", "h1", "h2", "h3", "h4", "h5", "tr", "img",
                        "br", "thead", "tbody", "hr", "section", "article", "ul", "ol"]
        self.top_node = None      # highest-scoring content node
        self.title = ""
        self._titlelen = 0
        self.clean_text = ""      # plain-text content (image lines stripped)
        self.format_text = ""     # content wrapped in <p> tags
        self.score = 0.0
        self.link_text_ratio = 0.0
        self.text_count = 0
        self.img_count = 0
        self.findtitle = True     # whether calcuate() should hunt for the title
        self.title_tmp = ""       # best in-body title candidate found so far
        self.raw = ""             # content HTML with attributes stripped
        # Matches an opening tag so its attributes can be stripped.
        self.__htmlattr = re.compile(r'<([a-z]+\w*)[^>]*>', re.I)

    def extract(self, url, html_source):
        """Main entry point: parse *html_source* and populate the result
        attributes (title, clean_text, format_text, raw, ...).

        Returns True on success, None when parsing or extraction fails.
        """
        try:
            clean_html = self.clean_tag(html_source)
            clean_html = unescape(clean_html)
            self.doc = etree.HTML(clean_html)
        except (TypeError, ValueError, etree.XMLSyntaxError):
            return
        if self.doc is None:
            return
        self.url = url
        self.title = self.get_title()
        if not self.title:
            return
        self._titlelen = len(self.title)
        self.score, self.link_text_ratio = self.get_top_node()
        if self.top_node is None:
            return
        self.remove_link_block()
        # Prefer the title candidate found in the body text when it covers
        # a meaningful share of the <title> string.
        if (len(self.title_tmp) / float(self._titlelen)) > 0.2:
            self.title = self.title_tmp
        self.raw = self.remove_htmlattr(self.top_node)
        content = self.output_format(self.top_node)
        # Plain-text version: blank out lines carrying <img> markup.
        self.clean_text = "\n".join([t if "img" not in t else
                                     "" for t in content.split('\n')])
        for text in content.split("\n"):
            if "img" in text:
                self.img_count += 1
                self.format_text += '<p align="center">%s</p>' % text
            else:
                text = text.strip()
                if not text:
                    continue
                self.text_count += len(text)
                if '相关文章' in text:  # skip "related articles" boilerplate
                    continue
                self.format_text += '<p>%s</p>' % text
        return True

    def abstracturl(self, urlpath):
        """Resolve a relative link against the page URL."""
        return urljoin(self.url, urlpath)

    def get_title(self):
        """Pick the article title: the heading whose text is most similar
        to the <title> content, falling back to <title> itself."""
        title = ''.join(self.doc.xpath('//title/text()')).strip()
        if title == "":
            return ""
        titles = self.doc.xpath(
            '//h1|//h2|//h3|//*[contains(@class, "title")]|//*[contains(@id, "title")]')
        if not titles:
            return title
        ok_title = title
        max_sim = 0
        for tt in titles:
            # BUG FIX: compare the element's text, not the element object
            # itself -- strsim on an lxml element always evaluated to 0,
            # so the heading match never fired.
            tt_text = ''.join(tt.itertext()).strip()
            sim = strsim(tt_text, title)
            if sim > max_sim:
                ok_title = tt_text
                max_sim = sim
        return ok_title

    @staticmethod
    def clean_tag(doc):
        """Strip script/noscript/style/iframe blocks, comments and layout
        noise from raw HTML before parsing; <br> becomes a newline."""
        doc = re.sub(r'<script.*?>.*?</script>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<noscript.*?>.*?</noscript>',
                     '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<style.*?>.*?</style>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<iframe.*?>.*?</iframe>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'[\r\t]+', '', doc)
        doc = re.sub(r'<br\s*/?>', '\n', doc, flags=re.I)
        doc = re.sub(r'<!--.*?-->', '', doc, flags=re.S)
        # NOTE(review): the original pattern here was a plain space replaced
        # by a space (a no-op); normalizing non-breaking spaces was almost
        # certainly the intent -- confirm against upstream history.
        doc = re.sub(r'\xa0', ' ', doc)
        return doc

    @staticmethod
    def contents(node):
        """Return the node's direct children: text nodes and elements,
        in document order."""
        result = []
        result.extend(node.xpath("child::text()|child::*"))
        return result

    def calcuate(self, node, record):
        """Recursively compute a CountInfo for *node* and every element
        below it, storing results in *record* (node -> CountInfo).

        Text leaves contribute their stripped length; while
        ``self.findtitle`` is set, the longest text leaf that is a prefix
        of the page title is remembered as a title candidate.
        """
        if etree.iselement(node):
            info = CountInfo()
            for elem in self.contents(node):
                childinfo = self.calcuate(elem, record)
                info.text_count += childinfo.text_count
                info.link_text_count += childinfo.link_text_count
                info.tag_count += childinfo.tag_count
                info.link_tag_count += childinfo.link_tag_count
                info.leaflist.extend(childinfo.leaflist)
                info.density_sum += childinfo.density
                info.pcount += childinfo.pcount
            info.tag_count += 1
            tagname = node.tag
            if tagname == "a":
                # All text under a link counts as link text.
                info.link_text_count = info.text_count
                info.link_tag_count += 1
            elif tagname == "p":
                info.pcount += 1
            purelen = info.text_count - info.link_text_count
            not_link_tag_num = info.tag_count - info.link_tag_count
            if purelen == 0 or not_link_tag_num == 0:
                info.density = 0
            else:
                info.density = float(purelen) / not_link_tag_num
            record[node] = info
            return info
        elif hasattr(node, "is_text"):
            # lxml text node ("smart string").
            info = CountInfo()
            nodetext = node.strip()
            txtlen = len(nodetext)
            info.text_count = txtlen
            info.leaflist.append(txtlen)
            tmp_len = len(self.title_tmp)
            if self.findtitle and tmp_len < txtlen <= self._titlelen \
                    and self.title.startswith(nodetext):
                self.title_tmp = nodetext
            return info
        else:
            # Comments, processing instructions, etc. contribute nothing.
            return CountInfo()

    def calcuate_score(self, node):
        """Score a node: higher for dense, varied, link-light text with
        many paragraphs. Returns 0.0 for unscored nodes."""
        info = self._infomap.get(node)
        if info is None:
            return 0.0
        val = math.sqrt(self.calcuate_var(info.leaflist) + 1)
        return math.log(val) * info.density_sum * \
            math.log(float(info.text_count -
                           info.link_text_count) + 1.0) * \
            math.log10(float(info.pcount) + 2.0)

    @staticmethod
    def calcuate_var(leafs):
        """Population variance of the leaf text lengths.

        A single leaf returns half its length as a heuristic spread.
        (The original docstring said "average"; the code computes variance.)
        """
        leaf_len = len(leafs)
        if leaf_len <= 0:
            return 0.0
        if leaf_len == 1:
            return leafs[0] / 2.0
        ave = sum(float(v) for v in leafs) / leaf_len
        return sum((float(v) - ave) ** 2 for v in leafs) / leaf_len

    def get_top_node(self):
        """Score every node under <body> and select the best one as
        ``self.top_node``. Returns (max_score, link_text_ratio)."""
        self.findtitle = True
        body = self.doc.find("body")
        self.calcuate(body, self._infomap)
        max_score = 0.0
        link_text_ratio = 0.0
        for node, info in self._infomap.items():
            tagname = node.tag
            if tagname in ["a", "body"]:
                continue
            score = self.calcuate_score(node)
            if score > max_score:
                max_score = score
                self.top_node = node
                try:
                    link_text_ratio = info.link_text_count / \
                        float(info.text_count)
                except ZeroDivisionError:
                    pass
        return max_score, link_text_ratio

    def remove_htmlattr(self, node):
        """Serialize *node* to HTML with all attributes removed, keeping
        only the src of <img> tags and dropping <div>/<a> wrappers."""
        raw = etree.tounicode(node)
        raw = re.sub('</?(?:div|a)[^>]*>', '', raw, flags=re.I)
        # Protect the image src by rewriting to a temporary <#img ...>
        # marker so the attribute-stripping pass below does not eat it.
        # BUG FIX: replacement strings are now raw -- '\g<1>' in a plain
        # string is an invalid escape sequence.
        raw = re.sub(r'<img[^>]*?(?:src|data-src)\s?=\s?[\'"]?([^\'"]+)[^>]*>',
                     r'<#img src="\g<1>" />', raw, flags=re.I)
        raw = self.__htmlattr.sub(r'<\g<1>>', raw)
        raw = re.sub(r'<#img', '<img', raw).strip()
        return re.sub(r'[\r\n\t]+|</?div>', '', self.remove_empty_node(raw))

    def remove_empty_node(self, text):
        """Drop tags that contain no text (recursing into nested empty
        wrappers), keeping <img>/<br> and anything holding an image."""
        text = unescape(text)
        try:
            doc = pq(text)
        except (lxml.etree.ParserError, lxml.etree.XMLSyntaxError,
                requests.exceptions.InvalidURL, urllib.error.HTTPError):
            return text
        except Exception:
            # Best-effort: any other parse failure returns the input as-is.
            return text
        for item in doc.contents():
            if etree.iselement(item):
                if ''.join(item.itertext()).strip() == "":
                    hasimg = pq(item).find("img")
                    if item.tag not in ['img', 'br'] and not hasimg.attr('src'):
                        pq(item).remove()
                else:
                    self.remove_empty_node(item)
        return str(doc)

    def output_format(self, cnode):
        """Flatten *cnode* to text, inserting newlines at block-level tags
        and rewriting images to absolute <img src=...> markup."""
        content = ""
        for node in self.contents(cnode):
            if hasattr(node, "is_text"):
                content += node
            elif etree.iselement(node):
                if node.tag in self.newline:
                    content += "\n"
                if node.tag == "img":
                    src = node.attrib.get("data-src", "") \
                        or node.attrib.get("src", "")
                    # BUG FIX: only resolve and emit when a src exists --
                    # urljoin(url, "") returns the page URL, so srcless
                    # images used to emit <img src="<page-url>" />.
                    if src:
                        content += '<img src="%s" />' % self.abstracturl(src)
                content += self.output_format(node)
        return content.strip()

    def remove_link_block(self):
        """Remove child blocks of the top node whose text is mostly links
        (navigation bars, related-article lists)."""
        self.findtitle = False
        self.calcuate(self.top_node, self._blockinfo)
        for node, info in self._blockinfo.items():
            if node.tag == "a":
                continue
            try:
                link_text_ratio = info.link_text_count / info.text_count
            except ZeroDivisionError:
                continue
            if link_text_ratio > 0.5:
                parentnode = node.getparent()
                if etree.iselement(parentnode):
                    parentnode.remove(node)
if __name__ == '__main__':
    # Demo: fetch one article and print the extracted title and body HTML.
    target = 'http://www.fanwenbaba.cn/nianzhongzongjie/43952.html'
    resp = requests.get(target, timeout=10)
    resp.encoding = 'utf-8'
    extractor = ContentExtractor()
    extractor.extract(target, resp.text)
    print(extractor.title)
    print("=" * 100)
    print(extractor.raw)
# Note: this module extracts the main body text of news-style pages.