# -*- coding: utf-8 -*-
import urllib.error
import lxml
from lxml import etree
from urllib.parse import urljoin
from html import unescape
import re
import math
import requests
from pyquery import PyQuery as pq
# Compute the similarity of two strings (LCS-length ratio, not cosine similarity)
def strsim(x: str, y: str):
    """Similarity of two strings in [0, 1]: LCS length divided by the
    longer length.

    Returns 0 for empty input, or when the lengths differ by more than a
    factor of 3 (such strings are assumed unrelated).
    """
    xlen, ylen = len(x), len(y)
    if not xlen or not ylen:
        return 0
    if xlen > ylen:
        ratio = xlen / ylen
    else:
        ratio = ylen / xlen
    if ratio > 3:
        return 0
    return lcs(x, y) / max(xlen, ylen)


def lcs(x: str, y: str):
    """Return the length of the longest common subsequence of *x* and *y*.

    Classic DP recurrence, but only two rows of the table are kept,
    cutting memory from O(len(x)*len(y)) to O(len(y)) while producing the
    same result as the full-table version.
    """
    xlen, ylen = len(x), len(y)
    if not xlen or not ylen:
        return 0
    below = [0] * (ylen + 1)  # DP row i + 1
    for i in range(xlen - 1, -1, -1):
        row = [0] * (ylen + 1)  # DP row i, being filled
        for j in range(ylen - 1, -1, -1):
            if x[i] == y[j]:
                row[j] = below[j + 1] + 1
            else:
                row[j] = max(below[j], row[j + 1])
        below = row
    return below[0]
class CountInfo(object):
    """Per-node text/tag statistics used to score content density."""
    def __init__(self):
        # Total characters of (stripped) text under the node.
        self.text_count = 0
        # Characters of text that live inside <a> tags.
        self.link_text_count = 0
        # Number of element tags under the node, including itself.
        self.tag_count = 0
        # Number of <a> tags under the node.
        self.link_tag_count = 0
        # Non-link text length per non-link tag for this node.
        self.density = 0.0
        # Sum of the direct children's densities.
        self.density_sum = 0.0
        self.score = 0.0
        # Number of <p> tags under the node.
        self.pcount = 0
        # Length of every text leaf under the node.
        self.leaflist = []
class ContentExtractor:
    """Extract the main article content from a news-style HTML page.

    Every DOM node is scored by text density (text length vs. tag count,
    discounting link text); the highest-scoring node becomes the content
    root. Typical usage::

        ex = ContentExtractor()
        if ex.extract(url, html_source):
            print(ex.title, ex.clean_text)
    """

    def __init__(self):
        self.doc = None           # parsed lxml document tree
        self._infomap = {}        # node -> CountInfo for the whole page
        self._blockinfo = {}      # node -> CountInfo inside the top node
        self.url = None
        # Block-level tags that start a new output line when formatting.
        self.newline = ["div", "li", "p", "h1", "h2", "h3", "h4", "h5", "tr", "img",
                        "br", "thead", "tbody", "hr", "section", "article", "ul", "ol"]
        self.top_node = None      # highest-scoring content node
        self.title = ""
        self._titlelen = 0
        self.clean_text = ""      # plain-text content (image lines stripped)
        self.format_text = ""     # content wrapped in <p> tags
        self.score = 0.0
        self.link_text_ratio = 0.0
        self.text_count = 0
        self.img_count = 0
        self.findtitle = True     # whether calcuate() should hunt for the title
        self.title_tmp = ""       # best in-body title candidate found so far
        self.raw = ""             # content HTML with attributes stripped
        # Matches an opening tag so its attributes can be stripped.
        self.__htmlattr = re.compile(r'<([a-z]+\w*)[^>]*>', re.I)

    def extract(self, url, html_source):
        """Main entry point: parse *html_source* and populate the result
        attributes (title, clean_text, format_text, raw, ...).

        Returns True on success, None when parsing or extraction fails.
        """
        try:
            clean_html = self.clean_tag(html_source)
            clean_html = unescape(clean_html)
            self.doc = etree.HTML(clean_html)
        except (TypeError, ValueError, etree.XMLSyntaxError):
            return
        if self.doc is None:
            return
        self.url = url
        self.title = self.get_title()
        if not self.title:
            return
        self._titlelen = len(self.title)
        self.score, self.link_text_ratio = self.get_top_node()
        if self.top_node is None:
            return
        self.remove_link_block()
        # Prefer the title candidate found in the body text when it covers
        # a meaningful share of the <title> string.
        if (len(self.title_tmp) / float(self._titlelen)) > 0.2:
            self.title = self.title_tmp
        self.raw = self.remove_htmlattr(self.top_node)
        content = self.output_format(self.top_node)
        # Plain-text version: blank out lines carrying <img> markup.
        self.clean_text = "\n".join([t if "img" not in t else
                                     "" for t in content.split('\n')])
        for text in content.split("\n"):
            if "img" in text:
                self.img_count += 1
                self.format_text += '<p align="center">%s</p>' % text
            else:
                text = text.strip()
                if not text:
                    continue
                self.text_count += len(text)
                if '相关文章' in text:  # skip "related articles" boilerplate
                    continue
                self.format_text += '<p>%s</p>' % text
        return True

    def abstracturl(self, urlpath):
        """Resolve a relative link against the page URL."""
        return urljoin(self.url, urlpath)

    def get_title(self):
        """Pick the article title: the heading whose text is most similar
        to the <title> content, falling back to <title> itself."""
        title = ''.join(self.doc.xpath('//title/text()')).strip()
        if title == "":
            return ""
        titles = self.doc.xpath(
            '//h1|//h2|//h3|//*[contains(@class, "title")]|//*[contains(@id, "title")]')
        if not titles:
            return title
        ok_title = title
        max_sim = 0
        for tt in titles:
            # BUG FIX: compare the element's text, not the element object
            # itself -- strsim on an lxml element always evaluated to 0,
            # so the heading match never fired.
            tt_text = ''.join(tt.itertext()).strip()
            sim = strsim(tt_text, title)
            if sim > max_sim:
                ok_title = tt_text
                max_sim = sim
        return ok_title

    @staticmethod
    def clean_tag(doc):
        """Strip script/noscript/style/iframe blocks, comments and layout
        noise from raw HTML before parsing; <br> becomes a newline."""
        doc = re.sub(r'<script.*?>.*?</script>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<noscript.*?>.*?</noscript>',
                     '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<style.*?>.*?</style>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<iframe.*?>.*?</iframe>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'[\r\t]+', '', doc)
        doc = re.sub(r'<br\s*/?>', '\n', doc, flags=re.I)
        doc = re.sub(r'<!--.*?-->', '', doc, flags=re.S)
        # NOTE(review): the original pattern here was a plain space replaced
        # by a space (a no-op); normalizing non-breaking spaces was almost
        # certainly the intent -- confirm against upstream history.
        doc = re.sub(r'\xa0', ' ', doc)
        return doc

    @staticmethod
    def contents(node):
        """Return the node's direct children: text nodes and elements,
        in document order."""
        result = []
        result.extend(node.xpath("child::text()|child::*"))
        return result

    def calcuate(self, node, record):
        """Recursively compute a CountInfo for *node* and every element
        below it, storing results in *record* (node -> CountInfo).

        Text leaves contribute their stripped length; while
        ``self.findtitle`` is set, the longest text leaf that is a prefix
        of the page title is remembered as a title candidate.
        """
        if etree.iselement(node):
            info = CountInfo()
            for elem in self.contents(node):
                childinfo = self.calcuate(elem, record)
                info.text_count += childinfo.text_count
                info.link_text_count += childinfo.link_text_count
                info.tag_count += childinfo.tag_count
                info.link_tag_count += childinfo.link_tag_count
                info.leaflist.extend(childinfo.leaflist)
                info.density_sum += childinfo.density
                info.pcount += childinfo.pcount
            info.tag_count += 1
            tagname = node.tag
            if tagname == "a":
                # All text under a link counts as link text.
                info.link_text_count = info.text_count
                info.link_tag_count += 1
            elif tagname == "p":
                info.pcount += 1
            purelen = info.text_count - info.link_text_count
            not_link_tag_num = info.tag_count - info.link_tag_count
            if purelen == 0 or not_link_tag_num == 0:
                info.density = 0
            else:
                info.density = float(purelen) / not_link_tag_num
            record[node] = info
            return info
        elif hasattr(node, "is_text"):
            # lxml text node ("smart string").
            info = CountInfo()
            nodetext = node.strip()
            txtlen = len(nodetext)
            info.text_count = txtlen
            info.leaflist.append(txtlen)
            tmp_len = len(self.title_tmp)
            if self.findtitle and tmp_len < txtlen <= self._titlelen \
                    and self.title.startswith(nodetext):
                self.title_tmp = nodetext
            return info
        else:
            # Comments, processing instructions, etc. contribute nothing.
            return CountInfo()

    def calcuate_score(self, node):
        """Score a node: higher for dense, varied, link-light text with
        many paragraphs. Returns 0.0 for unscored nodes."""
        info = self._infomap.get(node)
        if info is None:
            return 0.0
        val = math.sqrt(self.calcuate_var(info.leaflist) + 1)
        return math.log(val) * info.density_sum * \
            math.log(float(info.text_count -
                           info.link_text_count) + 1.0) * \
            math.log10(float(info.pcount) + 2.0)

    @staticmethod
    def calcuate_var(leafs):
        """Population variance of the leaf text lengths.

        A single leaf returns half its length as a heuristic spread.
        (The original docstring said "average"; the code computes variance.)
        """
        leaf_len = len(leafs)
        if leaf_len <= 0:
            return 0.0
        if leaf_len == 1:
            return leafs[0] / 2.0
        ave = sum(float(v) for v in leafs) / leaf_len
        return sum((float(v) - ave) ** 2 for v in leafs) / leaf_len

    def get_top_node(self):
        """Score every node under <body> and select the best one as
        ``self.top_node``. Returns (max_score, link_text_ratio)."""
        self.findtitle = True
        body = self.doc.find("body")
        self.calcuate(body, self._infomap)
        max_score = 0.0
        link_text_ratio = 0.0
        for node, info in self._infomap.items():
            tagname = node.tag
            if tagname in ["a", "body"]:
                continue
            score = self.calcuate_score(node)
            if score > max_score:
                max_score = score
                self.top_node = node
                try:
                    link_text_ratio = info.link_text_count / \
                        float(info.text_count)
                except ZeroDivisionError:
                    pass
        return max_score, link_text_ratio

    def remove_htmlattr(self, node):
        """Serialize *node* to HTML with all attributes removed, keeping
        only the src of <img> tags and dropping <div>/<a> wrappers."""
        raw = etree.tounicode(node)
        raw = re.sub('</?(?:div|a)[^>]*>', '', raw, flags=re.I)
        # Protect the image src by rewriting to a temporary <#img ...>
        # marker so the attribute-stripping pass below does not eat it.
        # BUG FIX: replacement strings are now raw -- '\g<1>' in a plain
        # string is an invalid escape sequence.
        raw = re.sub(r'<img[^>]*?(?:src|data-src)\s?=\s?[\'"]?([^\'"]+)[^>]*>',
                     r'<#img src="\g<1>" />', raw, flags=re.I)
        raw = self.__htmlattr.sub(r'<\g<1>>', raw)
        raw = re.sub(r'<#img', '<img', raw).strip()
        return re.sub(r'[\r\n\t]+|</?div>', '', self.remove_empty_node(raw))

    def remove_empty_node(self, text):
        """Drop tags that contain no text (recursing into nested empty
        wrappers), keeping <img>/<br> and anything holding an image."""
        text = unescape(text)
        try:
            doc = pq(text)
        except (lxml.etree.ParserError, lxml.etree.XMLSyntaxError,
                requests.exceptions.InvalidURL, urllib.error.HTTPError):
            return text
        except Exception:
            # Best-effort: any other parse failure returns the input as-is.
            return text
        for item in doc.contents():
            if etree.iselement(item):
                if ''.join(item.itertext()).strip() == "":
                    hasimg = pq(item).find("img")
                    if item.tag not in ['img', 'br'] and not hasimg.attr('src'):
                        pq(item).remove()
                else:
                    self.remove_empty_node(item)
        return str(doc)

    def output_format(self, cnode):
        """Flatten *cnode* to text, inserting newlines at block-level tags
        and rewriting images to absolute <img src=...> markup."""
        content = ""
        for node in self.contents(cnode):
            if hasattr(node, "is_text"):
                content += node
            elif etree.iselement(node):
                if node.tag in self.newline:
                    content += "\n"
                if node.tag == "img":
                    src = node.attrib.get("data-src", "") \
                        or node.attrib.get("src", "")
                    # BUG FIX: only resolve and emit when a src exists --
                    # urljoin(url, "") returns the page URL, so srcless
                    # images used to emit <img src="<page-url>" />.
                    if src:
                        content += '<img src="%s" />' % self.abstracturl(src)
                content += self.output_format(node)
        return content.strip()

    def remove_link_block(self):
        """Remove child blocks of the top node whose text is mostly links
        (navigation bars, related-article lists)."""
        self.findtitle = False
        self.calcuate(self.top_node, self._blockinfo)
        for node, info in self._blockinfo.items():
            if node.tag == "a":
                continue
            try:
                link_text_ratio = info.link_text_count / info.text_count
            except ZeroDivisionError:
                continue
            if link_text_ratio > 0.5:
                parentnode = node.getparent()
                if etree.iselement(parentnode):
                    parentnode.remove(node)
if __name__ == '__main__':
    # Demo: fetch one article and print the extracted title and body HTML.
    target = 'http://www.fanwenbaba.cn/nianzhongzongjie/43952.html'
    resp = requests.get(target, timeout=10)
    resp.encoding = 'utf-8'
    extractor = ContentExtractor()
    extractor.extract(target, resp.text)
    print(extractor.title)
    print("=" * 100)
    print(extractor.raw)
# Note: this module extracts the main body text of news-style pages.