htmlparser.py 59 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247
  1. #coding:utf8
  2. import re
  3. # from BaseDataMaintenance.maintenance.product.productUtils import is_similar
  4. # from BiddingKG.dl.common.Utils import log
  5. import logging
# Configure the root logger at import time (INFO level, timestamped format) and
# create this module's own logger, also pinned to INFO.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
  9. def log(msg):
  10. '''
  11. @summary:打印信息
  12. '''
  13. logger.info(msg)
  14. from bs4 import BeautifulSoup
  15. import copy
  16. import Levenshtein
  17. def jaccard_score(source,target):
  18. source_set = set([s for s in source])
  19. target_set = set([s for s in target])
  20. if len(source_set)==0 or len(target_set)==0:
  21. return 0
  22. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  23. def judge_pur_chinese(keyword):
  24. """
  25. 中文字符的编码范围为: u'\u4e00' -- u'\u9fff:只要在此范围内就可以判断为中文字符串
  26. @param keyword:
  27. @return:
  28. """
  29. # 定义一个需要删除的标点符号字符串列表
  30. remove_chars = '[·’!"\#$%&\'()#!()*+,-./:;<=>?\@,:?¥★、….>【】[]《》?“”‘’\[\\]^_`{|}~]+'
  31. # 利用re.sub来删除中文字符串中的标点符号
  32. strings = re.sub(remove_chars, "", keyword) # 将keyword中文字符串中remove_chars中包含的标点符号替换为空字符串
  33. for ch in strings:
  34. if u'\u4e00' <= ch <= u'\u9fff':
  35. pass
  36. else:
  37. return False
  38. return True
  39. def is_similar(source,target,_radio=None):
  40. source = str(source).lower()
  41. target = str(target).lower()
  42. max_len = max(len(source),len(target))
  43. min_len = min(len(source),len(target))
  44. min_ratio = 90
  45. if min_len>=3:
  46. min_ratio = 87
  47. if min_len>=5:
  48. min_ratio = 85
  49. if _radio is not None:
  50. min_ratio = _radio
  51. # dis_len = abs(len(source)-len(target))
  52. # min_dis = min(max_len*0.2,4)
  53. if min_len==0 and max_len>0:
  54. return False
  55. if max_len<=2:
  56. if source==target:
  57. return True
  58. if min_len<2:
  59. return False
  60. #判断相似度
  61. similar = Levenshtein.ratio(source,target)*100
  62. if similar>=min_ratio:
  63. log("%s and %s similar_jaro %d"%(source,target,similar))
  64. return True
  65. similar_jaro = Levenshtein.jaro(source,target)
  66. if similar_jaro*100>=min_ratio:
  67. log("%s and %s similar_jaro %d"%(source,target,similar_jaro*100))
  68. return True
  69. similar_jarow = Levenshtein.jaro_winkler(source,target)
  70. if similar_jarow*100>=min_ratio:
  71. log("%s and %s similar_jaro %d"%(source,target,similar_jarow*100))
  72. return True
  73. if min_len>=5:
  74. if len(source)==max_len and str(source).find(target)>=0:
  75. return True
  76. elif len(target)==max_len and target.find(source)>=0:
  77. return True
  78. elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
  79. return True
  80. return False
# Module-level regular-expression sources (plain strings, compiled on use).
# NOTE(review): only _param_pattern is referenced in the part of the file shown
# here (ParseDocument.buildParsetree); the purpose of the others is inferred
# from their names and should be confirmed against their callers.
# Headings such as "commercial requirements" / "scoring criteria".
end_pattern = "商务要求|评分标准|商务条件|商务条件"
# Headings that look like a parameter / requirement / configuration section.
_param_pattern = "(产品|技术|清单|配置|参数|具体|明细|项目|招标|货物|服务|规格|工作|具体)[及和与]?(指标|配置|条件|要求|参数|需求|规格|条款|名称及要求)|配置清单|(质量|技术).{,10}要求|验收标准|^(参数|功能)$"
# Numbers with units and vocabulary typical of technical specifications.
meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
# Commercial / pricing / scoring vocabulary.
not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|专家论证意见|评标方法|代理服务费|售后服务|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方"
  85. def getTrs(tbody):
  86. #获取所有的tr
  87. trs = []
  88. if tbody.name=="table":
  89. body = tbody.find("tbody",recursive=False)
  90. if body is not None:
  91. tbody = body
  92. objs = tbody.find_all(recursive=False)
  93. for obj in objs:
  94. if obj.name=="tr":
  95. trs.append(obj)
  96. if obj.name=="tbody" or obj.name=="table":
  97. for tr in obj.find_all("tr",recursive=False):
  98. trs.append(tr)
  99. return trs
  100. def fixSpan(tbody):
  101. # 处理colspan, rowspan信息补全问题
  102. #trs = tbody.findChildren('tr', recursive=False)
  103. trs = getTrs(tbody)
  104. ths_len = 0
  105. ths = list()
  106. trs_set = set()
  107. #修改为先进行列补全再进行行补全,否则可能会出现表格解析混乱
  108. # 遍历每一个tr
  109. for indtr, tr in enumerate(trs):
  110. ths_tmp = tr.findChildren('th', recursive=False)
  111. #不补全含有表格的tr
  112. if len(tr.findChildren('table'))>0:
  113. continue
  114. if len(ths_tmp) > 0:
  115. ths_len = ths_len + len(ths_tmp)
  116. for th in ths_tmp:
  117. ths.append(th)
  118. trs_set.add(tr)
  119. # 遍历每行中的element
  120. tds = tr.findChildren(recursive=False)
  121. for indtd, td in enumerate(tds):
  122. # 若有colspan 则补全同一行下一个位置
  123. if 'colspan' in td.attrs:
  124. if str(re.sub("[^0-9]","",str(td['colspan'])))!="":
  125. col = int(re.sub("[^0-9]","",str(td['colspan'])))
  126. if col<100 and len(td.get_text())<1000:
  127. td['colspan'] = 1
  128. for i in range(1, col, 1):
  129. td.insert_after(copy.copy(td))
  130. for indtr, tr in enumerate(trs):
  131. ths_tmp = tr.findChildren('th', recursive=False)
  132. #不补全含有表格的tr
  133. if len(tr.findChildren('table'))>0:
  134. continue
  135. if len(ths_tmp) > 0:
  136. ths_len = ths_len + len(ths_tmp)
  137. for th in ths_tmp:
  138. ths.append(th)
  139. trs_set.add(tr)
  140. # 遍历每行中的element
  141. tds = tr.findChildren(recursive=False)
  142. for indtd, td in enumerate(tds):
  143. # 若有rowspan 则补全下一行同样位置
  144. if 'rowspan' in td.attrs:
  145. if str(re.sub("[^0-9]","",str(td['rowspan'])))!="":
  146. row = int(re.sub("[^0-9]","",str(td['rowspan'])))
  147. td['rowspan'] = 1
  148. for i in range(1, row, 1):
  149. # 获取下一行的所有td, 在对应的位置插入
  150. if indtr+i<len(trs):
  151. tds1 = trs[indtr + i].findChildren(['td','th'], recursive=False)
  152. if len(tds1) >= (indtd) and len(tds1)>0:
  153. if indtd > 0:
  154. tds1[indtd - 1].insert_after(copy.copy(td))
  155. else:
  156. tds1[0].insert_before(copy.copy(td))
  157. elif indtd-2>0 and len(tds1) > 0 and len(tds1) == indtd - 1: # 修正某些表格最后一列没补全
  158. tds1[indtd-2].insert_after(copy.copy(td))
  159. def getTable(tbody):
  160. #trs = tbody.findChildren('tr', recursive=False)
  161. fixSpan(tbody)
  162. trs = getTrs(tbody)
  163. inner_table = []
  164. for tr in trs:
  165. tr_line = []
  166. tds = tr.findChildren(['td','th'], recursive=False)
  167. if len(tds)==0:
  168. tr_line.append([re.sub('\xa0','',tr.get_text()),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
  169. for td in tds:
  170. tr_line.append([re.sub('\xa0','',td.get_text()),0])
  171. #tr_line.append([td.get_text(),0])
  172. inner_table.append(tr_line)
  173. return inner_table
  174. class Sentence2():
  175. def __init__(self,text,sentence_index,wordOffset_begin,wordOffset_end):
  176. self.name = 'sentence2'
  177. self.text = text
  178. self.sentence_index = sentence_index
  179. self.wordOffset_begin = wordOffset_begin
  180. self.wordOffset_end = wordOffset_end
  181. def get_text(self):
  182. return self.text
  183. class ParseDocument():
  184. def __init__(self,_html,auto_merge_table=True,list_obj = []):
  185. if _html is None:
  186. _html = ""
  187. self.html = _html
  188. self.auto_merge_table = auto_merge_table
  189. if list_obj:
  190. self.list_obj = list_obj
  191. else:
  192. self.soup = BeautifulSoup(self.html, "lxml")
  193. _body = self.soup.find("body")
  194. if _body is not None:
  195. self.soup = _body
  196. self.list_obj = self.get_soup_objs(self.soup)
  197. # self.list_obj = [it.get_text().strip().replace(' ', '') for it in self.list_obj]
  198. # self.list_obj = [Sentence2(text, 1,1,5) for text in self.list_obj]
  199. # for obj in self.list_obj:
  200. # print("obj",obj.get_text()[:20])
  201. self.tree = self.buildParsetree(self.list_obj,[],auto_merge_table)
  202. # #识别目录树
  203. # if self.parseTree:
  204. # self.parseTree.printParseTree()
  205. # self.print_tree(self.tree,"-|")
  206. def get_soup_objs(self,soup,list_obj=None):
  207. if list_obj is None:
  208. list_obj = []
  209. childs = soup.find_all(recursive=False)
  210. for _obj in childs:
  211. childs1 = _obj.find_all(recursive=False)
  212. if len(childs1)==0 or len(_obj.get_text())<40 or _obj.name=="table":
  213. list_obj.append(_obj)
  214. elif _obj.name=="p":
  215. list_obj.append(_obj)
  216. else:
  217. self.get_soup_objs(_obj,list_obj)
  218. return list_obj
  219. def fix_tree(self,_product):
  220. products = extract_products(self.tree,_product)
  221. if len(products)>0:
  222. self.tree = self.buildParsetree(self.list_obj,products,self.auto_merge_table)
  223. def print_tree(self,tree,append=""):
  224. self.set_tree_id = set()
  225. if append=="":
  226. for t in tree:
  227. logger.debug("%s text:%s title:%s title_text:%s before:%s after%s product:%s"%("==>",t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"]))
  228. for t in tree:
  229. _id = id(t)
  230. if _id in self.set_tree_id:
  231. continue
  232. self.set_tree_id.add(_id)
  233. logger.info("%s text:%s title:%s title_text:%s before:%s after%s product:%s"%(append,t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"]))
  234. childs = t["child_title"]
  235. self.print_tree(childs,append=append+"-|")
  236. def is_title_first(self,title):
  237. if title in ("一","1","Ⅰ","a","A"):
  238. return True
  239. return False
  240. def find_title_by_pattern(self,_text,_pattern="(^|★|▲|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册包标部.::、、]+))|" \
  241. "([\s★▲\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>[、章册包标部.::、、]+))|" \
  242. "([\s★▲\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]+))|" \
  243. "([\s★▲\*]*)(?P<title_5>(?P<title_5_index_0_0>^)(?P<title_5_index_1_1>[一二三四五六七八九十]+)(?P<title_5_index_2_0>)[^一二三四五六七八九十节章册部\.::、、])|" \
  244. "([\s★▲\*]*)(?P<title_12>(?P<title_12_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_12_index_1_1>\d{1,2})(?P<title_12_index_2_0>[\..、\s\-]?))|"\
  245. "([\s★▲\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
  246. "([\s★▲\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
  247. "([\s★▲\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..包标::、\s\-]*))|" \
  248. "(^[\s★▲\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-包标]*))|" \
  249. "([\s★▲\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))包标\..::、]+))|" \
  250. "([\s★▲\*]+)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>[))包标\..::、]+))|" \
  251. "([\s★▲\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))"
  252. ):
  253. _se = re.search(_pattern,_text)
  254. groups = []
  255. if _se is not None:
  256. e = _se.end()
  257. if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减)[::]?\d', _se.group(0)) or (re.search('\d[.::]?$', _se.group(0)) and re.search('^[\d年月日万元天]', _text[e:])):
  258. return None
  259. _gd = _se.groupdict()
  260. for k,v in _gd.items():
  261. if v is not None:
  262. groups.append((k,v))
  263. if len(groups):
  264. # groups.sort(key=lambda x:x[0])
  265. return groups
  266. return None
  267. def make_increase(self,_sort,_title,_add=1):
  268. if len(_title)==0 and _add==0:
  269. return ""
  270. if len(_title)==0 and _add==1:
  271. return _sort[0]
  272. _index = _sort.index(_title[-1])
  273. next_index = (_index+_add)%len(_sort)
  274. next_chr = _sort[next_index]
  275. if _index==len(_sort)-1:
  276. _add = 1
  277. else:
  278. _add = 0
  279. return next_chr+self.make_increase(_sort,_title[:-1],_add)
  280. def get_next_title(self,_title):
  281. if re.search("^\d+$",_title) is not None:
  282. return str(int(_title)+1)
  283. if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
  284. if _title[-1]=="十":
  285. return _title+"一"
  286. if _title[-1]=="百":
  287. return _title+"零一"
  288. if _title[-1]=="九":
  289. if len(_title)==1:
  290. return "十"
  291. if len(_title)==2:
  292. if _title[0]=="十":
  293. return "二十"
  294. if len(_title)==3:
  295. if _title[0]=="九":
  296. return "一百"
  297. else:
  298. _next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title[0]))
  299. return _next_title+"十"
  300. _next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
  301. _next_title = list(_next_title)
  302. _next_title.reverse()
  303. if _next_title[-1]!="十":
  304. if len(_next_title)>=2:
  305. _next_title.insert(-1,'十')
  306. if len(_next_title)>=4:
  307. _next_title.insert(-3,'百')
  308. if _title[0]=="十":
  309. if _next_title=="十":
  310. _next_title = ["二","十"]
  311. _next_title.insert(0,"十")
  312. _next_title = "".join(_next_title)
  313. return _next_title
  314. if re.search("^[a-z]+$",_title) is not None:
  315. _next_title = self.make_increase([chr(i+ord('a')) for i in range(26)],_title)
  316. _next_title = list(_next_title)
  317. _next_title.reverse()
  318. return "".join(_next_title)
  319. if re.search("^[A-Z]+$",_title) is not None:
  320. _next_title = self.make_increase([chr(i+ord('A')) for i in range(26)],_title)
  321. _next_title = list(_next_title)
  322. _next_title.reverse()
  323. return "".join(_next_title)
  324. if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
  325. _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
  326. _index = _sort.index(_title)
  327. if _index<len(_sort)-1:
  328. return _sort[_index+1]
  329. return None
  330. def count_title_before(self,list_obj):
  331. dict_before = {}
  332. dict_sentence_count = {}
  333. illegal_sentence = set()
  334. for obj_i in range(len(list_obj)):
  335. obj = list_obj[obj_i]
  336. _type = "sentence"
  337. _text = obj.text.strip()
  338. if obj.name=="table":
  339. _type = "table"
  340. _text = str(obj)
  341. _append = False
  342. if _type=="sentence":
  343. if len(_text)>10 and len(_text)<100:
  344. if _text not in dict_sentence_count:
  345. dict_sentence_count[_text] = 0
  346. dict_sentence_count[_text] += 1
  347. if re.search("\d+页",_text) is not None:
  348. illegal_sentence.add(_text)
  349. elif len(_text)<10:
  350. if re.search("第\d+页",_text) is not None:
  351. illegal_sentence.add(_text)
  352. sentence_groups = self.find_title_by_pattern(_text[:10])
  353. if sentence_groups:
  354. # c062f53cf83401e671822003d63c1828print("sentence_groups",sentence_groups)
  355. sentence_title = sentence_groups[0][0]
  356. sentence_title_text = sentence_groups[0][1]
  357. title_index = sentence_groups[-2][1]
  358. title_before = sentence_groups[1][1].replace("(","(").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".")
  359. title_after = sentence_groups[-1][1].replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".")
  360. next_index = self.get_next_title(title_index)
  361. if title_before not in dict_before:
  362. dict_before[title_before] = 0
  363. dict_before[title_before] += 1
  364. for k,v in dict_sentence_count.items():
  365. if v>10:
  366. illegal_sentence.add(k)
  367. return dict_before,illegal_sentence
  368. def is_page_no(self,sentence):
  369. if len(sentence)<10:
  370. if re.search("\d+页|^\-\d+\-$",sentence) is not None:
  371. return True
  372. def block_tree(self,childs):
  373. for child in childs:
  374. if not child["block"]:
  375. child["block"] = True
  376. childs2 = child["child_title"]
  377. self.block_tree(childs2)
    def buildParsetree(self,list_obj,products=[],auto_merge_table=True):
        """Turn the flat node list into a list of linked dicts describing the title hierarchy.

        Each kept node becomes a dict with its type ("sentence" or "table"),
        text, parsed table rows, detected title information and the links
        ``parent_title`` / ``child_title`` / ``title_next``. Non-title
        sentences may be merged into the previous entry, and adjacent tables
        with the same column count may be merged when ``auto_merge_table`` is
        true.

        @param list_obj: nodes exposing ``name``, ``text``, ``get_text()``,
            ``sentence_index``, ``wordOffset_begin`` and ``wordOffset_end``
        @param products: product names; a sentence matching one is flagged
            ``has_product`` and may be promoted to a synthetic "title_0" title
        @param auto_merge_table: merge a table into the previous table when
            both are at most two nodes apart and have the same column count
        @return: list of node dicts in document order
        """
        # NOTE(review): products=[] is a shared mutable default; it is only
        # iterated in this method, never mutated.
        self.parseTree = None
        trees = []
        # max_length: longest text among the first 200 non-table nodes, capped
        # at 40; used below to decide whether a line runs on into the next.
        list_length = []
        for obj in list_obj[:200]:
            if obj.name!="table":
                list_length.append(len(obj.get_text()))
        if len(list_length)>0:
            max_length = max(list_length)
        else:
            max_length = 40
        max_length = min(max_length,40)
        logger.debug("%s:%d"%("max_length",max_length))
        list_data = []
        # State of the most recently stored table, used for table merging.
        last_table_index = None
        last_table_columns = None
        last_table = None
        dict_before,illegal_sentence = self.count_title_before(list_obj)
        for obj_i in range(len(list_obj)):
            obj = list_obj[obj_i]
            # logger.debug("==obj %s"%obj.text[:20])
            _type = "sentence"
            _text = standard_product(obj.text)
            if obj.name=="table":
                _type = "table"
                _text = standard_product(str(obj))
            _append = False
            sentence_title = None
            sentence_title_text = None
            sentence_groups = None
            title_index = None
            next_index = None
            parent_title = None
            title_before = None
            title_after = None
            title_next = None
            childs = []
            # new
            sentence_index = obj.sentence_index
            wordOffset_begin = obj.wordOffset_begin
            wordOffset_end = obj.wordOffset_end
            list_table = None
            block = False
            has_product = False
            if _type=="sentence":
                # Skip sentences flagged by count_title_before.
                if _text in illegal_sentence:
                    continue
                # Only the first 10 characters are searched for a numbering marker.
                sentence_groups = self.find_title_by_pattern(_text[:10])
                if sentence_groups:
                    title_before = standard_title_context(sentence_groups[1][1])
                    title_after = sentence_groups[-1][1]
                    sentence_title_text = sentence_groups[0][1]
                    other_text = _text.replace(sentence_title_text,"")
                    # Accept the marker as a title when its prefix occurs more
                    # than once in the document or it has a non-empty suffix.
                    if (title_before in dict_before and dict_before[title_before]>1) or title_after!="":
                        sentence_title = sentence_groups[0][0]
                        title_index = sentence_groups[-2][1]
                        next_index = self.get_next_title(title_index)
                        other_text = _text.replace(sentence_title_text,"")
                        for p in products:
                            if other_text.strip()==p.strip():
                                has_product = True
                    else:
                        # Marker rejected: keep the sentence as a title only if
                        # the rest of the text is exactly a product name.
                        _fix = False
                        for p in products:
                            if other_text.strip()==p.strip():
                                title_before = "=产品"
                                sentence_title = "title_0"
                                sentence_title_text = p
                                title_index = "0"
                                title_after = "产品="
                                next_index = "0"
                                _fix = True
                                has_product = True
                                break
                        if not _fix:
                            title_before = None
                            title_after = None
                            sentence_title_text = None
                else:
                    # No marker: a short parameter/requirement heading that
                    # contains a product name becomes a synthetic title.
                    if len(_text)<40 and re.search(_param_pattern,_text) is not None:
                        for p in products:
                            if _text.find(p)>=0:
                                title_before = "=产品"
                                sentence_title = "title_0"
                                sentence_title_text = p
                                title_index = "0"
                                title_after = "产品="
                                next_index = "0"
                                # NOTE(review): _fix is assigned here but not read in this branch.
                                _fix = True
                                has_product = True
                                break
            if _type=="sentence":
                # A non-title sentence is appended to the previous entry when
                # that entry's last line is long relative to max_length.
                if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
                    list_data[-1]["text"] += _text
                    list_data[-1]["line_width"] = len(_text)
                    _append = True
                elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
                    if list_data[-1]["line_width"]>=max_length*0.7:
                        list_data[-1]["text"] += _text
                        list_data[-1]["line_width"] = len(_text)
                        _append = True
            if _type=="table":
                _soup = BeautifulSoup(_text,"lxml")
                _table = _soup.find("table")
                if _table is not None:
                    list_table = getTable(_table)
                    if len(list_table)==0:
                        continue
                    table_columns = len(list_table[0])
                    if auto_merge_table:
                        # Merge into the previous table when it is at most two
                        # nodes back and has the same number of columns.
                        if last_table_index is not None and abs(obj_i-last_table_index)<=2 and last_table_columns is not None and last_table_columns==table_columns:
                            if last_table is not None:
                                trs = getTrs(_table)
                                last_tbody = BeautifulSoup(last_table["text"],"lxml")
                                _table = last_tbody.find("table")
                                last_trs = getTrs(_table)
                                _append = True
                                for _line in list_table:
                                    last_table["list_table"].append(_line)
                                if len(last_trs)>0:
                                    for _tr in trs:
                                        last_trs[-1].insert_after(copy.copy(_tr))
                                    # Drop the html/body wrappers added by the lxml parser.
                                    last_table["text"] = re.sub("</?html>|</?body>","",str(last_tbody))
                                last_table_index = obj_i
                                last_table_columns = len(list_table[-1])
            if not _append:
                _data = {"type":_type, "text":_text,"list_table":list_table,"line_width":len(_text),"sentence_title":sentence_title,"title_index":title_index,
                         "sentence_title_text":sentence_title_text,"sentence_groups":sentence_groups,"parent_title":parent_title,
                         "child_title":childs,"title_before":title_before,"title_after":title_after,"title_next":title_next,"next_index":next_index,
                         "block":block,"has_product":has_product,
                         "sentence_index":sentence_index,"wordOffset_begin":wordOffset_begin,"wordOffset_end":wordOffset_end
                         }
                if _type=="table":
                    last_table = _data
                    last_table_index = obj_i
                    if list_table:
                        last_table_columns = last_table_columns = len(list_table[-1])
                if sentence_title is not None:
                    if len(list_data)>0:
                        if self.is_title_first(title_index):
                            # First index of a series: it opens a new level
                            # under the nearest preceding title.
                            for i in range(1,len(list_data)+1):
                                _d = list_data[-i]
                                if _d["sentence_title"] is not None:
                                    _data["parent_title"] = _d
                                    _d["child_title"].append(_data)
                                    break
                        else:
                            # Look backwards for the sibling this title
                            # continues, trying progressively looser matches.
                            _find = False
                            # 1) same pattern and prefix/suffix, and this index
                            #    is exactly the sibling's expected next index.
                            for i in range(1,len(list_data)+1):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
                                    if _d["next_index"]==title_index and _d["title_next"] is None and not _d["block"]:
                                        _data["parent_title"] = _d["parent_title"]
                                        _d["title_next"] = _data
                                        if len(_d["child_title"])>0:
                                            _d["child_title"][-1]["title_next"] = ""
                                            self.block_tree(_d["child_title"])
                                        if _d["parent_title"] is not None:
                                            _d["parent_title"]["child_title"].append(_data)
                                        _find = True
                                        break
                            # 2) the immediately preceding entry has the same
                            #    pattern and prefix/suffix (index not checked).
                            for i in range(1,len(list_data)+1):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if i==1 and not _d["block"] and _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
                                    _data["parent_title"] = _d["parent_title"]
                                    _d["title_next"] = _data
                                    if len(_d["child_title"])>0:
                                        _d["child_title"][-1]["title_next"] = ""
                                        self.block_tree(_d["child_title"])
                                    if _d["parent_title"] is not None:
                                        _d["parent_title"]["child_title"].append(_data)
                                    _find = True
                                    break
                            # From here on prefix and suffix are compared after
                            # punctuation normalisation.
                            title_before = standard_title_context(title_before)
                            title_after = standard_title_context(title_after)
                            # 3) as 1), on normalised prefix/suffix.
                            for i in range(1,len(list_data)+1):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if _d.get("sentence_title")==sentence_title and title_before==standard_title_context(_d["title_before"]) and title_after==standard_title_context(_d["title_after"]):
                                    if _d["next_index"]==title_index and _d["title_next"] is None and not _d["block"]:
                                        _data["parent_title"] = _d["parent_title"]
                                        _d["title_next"] = _data
                                        if len(_d["child_title"])>0:
                                            _d["child_title"][-1]["title_next"] = ""
                                            self.block_tree(_d["child_title"])
                                        if _d["parent_title"] is not None:
                                            _d["parent_title"]["child_title"].append(_data)
                                        _find = True
                                        break
                            # 4) any unblocked earlier entry with the same
                            #    pattern and normalised prefix/suffix.
                            for i in range(1,len(list_data)+1):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if not _d["block"] and _d.get("sentence_title")==sentence_title and title_before==standard_title_context(_d["title_before"]) and title_after==standard_title_context(_d["title_after"]):
                                    _data["parent_title"] = _d["parent_title"]
                                    _d["title_next"] = _data
                                    if len(_d["child_title"])>0:
                                        _d["child_title"][-1]["title_next"] = ""
                                        # self.block_tree(_d["child_title"])
                                    if _d["parent_title"] is not None:
                                        _d["parent_title"]["child_title"].append(_data)
                                    _find = True
                                    break
                            # 5) among the last 19 entries: same pattern and
                            #    normalised prefix only.
                            for i in range(1,min(len(list_data)+1,20)):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if not _d["block"] and _d.get("sentence_title")==sentence_title and title_before==standard_title_context(_d["title_before"]):
                                    _data["parent_title"] = _d["parent_title"]
                                    _d["title_next"] = _data
                                    if len(_d["child_title"])>0:
                                        _d["child_title"][-1]["title_next"] = ""
                                        # self.block_tree(_d["child_title"])
                                    if _d["parent_title"] is not None:
                                        _d["parent_title"]["child_title"].append(_data)
                                    _find = True
                                    break
                            # No sibling found: attach under the nearest
                            # preceding title.
                            if not _find:
                                if len(list_data)>0:
                                    for i in range(1,len(list_data)+1):
                                        _d = list_data[-i]
                                        if _d.get("sentence_title") is not None:
                                            _data["parent_title"] = _d
                                            _d["child_title"].append(_data)
                                            break
                else:
                    # Plain sentence or table: child of the nearest preceding title.
                    if len(list_data)>0:
                        for i in range(1,len(list_data)+1):
                            _d = list_data[-i]
                            if _d.get("sentence_title") is not None:
                                _data["parent_title"] = _d
                                _d["child_title"].append(_data)
                                break
                list_data.append(_data)
        # Spread has_product to neighbouring sibling titles that share the same
        # prefix and suffix: one forward pass, then a second pass.
        for _data in list_data:
            childs = _data["child_title"]
            for c_i in range(len(childs)):
                cdata = childs[c_i]
                if cdata["has_product"]:
                    continue
                else:
                    if c_i>0:
                        last_cdata = childs[c_i-1]
                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
                            cdata["has_product"] = True
                    if c_i<len(childs)-1:
                        last_cdata = childs[c_i+1]
                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
                            cdata["has_product"] = True
            # NOTE(review): this pass takes cdata from the end of the list but
            # still picks its neighbours with c_i rather than the mirrored
            # index; confirm whether that is intended.
            for c_i in range(len(childs)):
                cdata = childs[len(childs)-1-c_i]
                if cdata["has_product"]:
                    continue
                else:
                    if c_i>0:
                        last_cdata = childs[c_i-1]
                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
                            cdata["has_product"] = True
                    if c_i<len(childs)-1:
                        last_cdata = childs[c_i+1]
                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
                            cdata["has_product"] = True
        return list_data
def standard_title_context(_title_context):
    """Normalise the punctuation around a title index so that equivalent markers compare equal.

    As written here, ":" becomes ";" and both "," and "、" become ".".
    """
    # NOTE(review): several replace() calls below show identical search and
    # replacement text, which makes them no-ops. Presumably the search side was
    # a full-width character (e.g. full-width parentheses, colon, comma, dot)
    # that was folded to ASCII somewhere; confirm against the file's original
    # encoding.
    return _title_context.replace("(","(").replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")
def standard_product(sentence):
    """Normalise the parentheses in ``sentence`` and return the result."""
    # NOTE(review): both replace() calls show identical search and replacement
    # text, so as written this returns the text unchanged. Presumably the
    # search side was a full-width parenthesis; confirm against the file's
    # original encoding.
    return sentence.replace("(","(").replace(")",")")
  650. def extract_products(list_data,_product,_param_pattern = "产品名称|设备材料|采购内存|标的名称|采购内容|(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体|标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)[\))的]?([、\w]{,4}名称|内容|描述)|标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$"):
  651. _product = standard_product(_product)
  652. list_result = []
  653. list_table_products = []
  654. for _data_i in range(len(list_data)):
  655. _data = list_data[_data_i]
  656. _type = _data["type"]
  657. _text = _data["text"]
  658. if _type=="table":
  659. list_table = _data["list_table"]
  660. if list_table is None:
  661. continue
  662. _check = True
  663. max_length = max([len(a) for a in list_table])
  664. min_length = min([len(a) for a in list_table])
  665. if min_length<max_length/2:
  666. continue
  667. list_head_index = []
  668. _begin_index = 0
  669. head_cell_text = ""
  670. for line_i in range(len(list_table[:2])):
  671. line = list_table[line_i]
  672. line_text = ",".join([cell[0] for cell in line])
  673. for cell_i in range(len(line)):
  674. cell = line[cell_i]
  675. cell_text = cell[0]
  676. if len(cell_text)<10 and re.search(_param_pattern,cell_text) is not None and re.search("单价|数量|预算|限价|总价|品牌|规格|型号|用途|要求|采购量",line_text) is not None:
  677. _begin_index = line_i+1
  678. list_head_index.append(cell_i)
  679. for line_i in range(len(list_table)):
  680. line = list_table[line_i]
  681. for cell_i in list_head_index:
  682. if cell_i>=len(line):
  683. continue
  684. cell = line[cell_i]
  685. cell_text = cell[0]
  686. head_cell_text += cell_text
  687. # print("===head_cell_text",head_cell_text)
  688. if re.search("招标人|采购人|项目编号|项目名称|金额|^\d+$",head_cell_text) is not None:
  689. list_head_index = []
  690. for line in list_table:
  691. line_text = ",".join([cell[0] for cell in line])
  692. for cell_i in range(len(line)):
  693. cell = line[cell_i]
  694. cell_text = cell[0]
  695. if cell_text is not None and _product is not None and len(cell_text)<len(_product)*10 and cell_text.find(_product)>=0 and re.search("单价|数量|总价|规格|品牌|型号|用途|要求|采购量",line_text) is not None:
  696. list_head_index.append(cell_i)
  697. list_head_index = list(set(list_head_index))
  698. if len(list_head_index)>0:
  699. has_number = False
  700. for cell_i in list_head_index:
  701. table_products = []
  702. for line_i in range(_begin_index,len(list_table)):
  703. line = list_table[line_i]
  704. for _i in range(len(line)):
  705. cell = line[_i]
  706. cell_text = cell[0]
  707. if re.search("^\d+$",cell_text) is not None:
  708. has_number = True
  709. if cell_i>=len(line):
  710. continue
  711. cell = line[cell_i]
  712. cell_text = cell[0]
  713. if re.search(_param_pattern,cell_text) is None or has_number:
  714. if re.search("^[\da-zA-Z]+$",cell_text) is None:
  715. table_products.append(cell_text)
  716. if len(table_products)>0:
  717. logger.debug("table products %s"%(str(table_products)))
  718. if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=30:
  719. if re.search("招标人|代理人|预算|数量|交货期|品牌|产地","".join(table_products)) is None:
  720. list_table_products.append(table_products)
  721. _find = False
  722. for table_products in list_table_products:
  723. for _p in table_products:
  724. if is_similar(_product,_p,90):
  725. _find = True
  726. logger.debug("similar table_products %s"%(str(table_products)))
  727. list_result = list(set([a for a in table_products if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费|^其他$",a) is None]))
  728. break
  729. if not _find:
  730. for table_products in list_table_products:
  731. list_result.extend(table_products)
  732. list_result = list(set([a for a in list_result if len(a)>1 and len(a)<30 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
  733. return list_result
  734. def get_childs(childs, max_depth=None):
  735. list_data = []
  736. for _child in childs:
  737. list_data.append(_child)
  738. childs2 = _child.get("child_title",[])
  739. if len(childs2)>0 and (max_depth==None or max_depth>0):
  740. for _child2 in childs2:
  741. if max_depth != None:
  742. list_data.extend(get_childs([_child2], max_depth-1))
  743. else:
  744. list_data.extend(get_childs([_child2], None))
  745. return list_data
  746. def get_range_data_by_childs(list_data,childs):
  747. range_data = []
  748. list_child = get_childs(childs)
  749. list_index = []
  750. set_child = set([id(x) for x in list_child])
  751. for _data_i in range(len(list_data)):
  752. _data = list_data[_data_i]
  753. _id = id(_data)
  754. if _id in set_child:
  755. list_index.append(_data_i)
  756. if len(list_index)>0:
  757. range_data = list_data[min(list_index):max(list_index)+1]
  758. return range_data
  759. def get_correct_product(product,products):
  760. list_data = []
  761. for p in products:
  762. is_sim = is_similar(product,p)
  763. _d = {"product":p,"distance":abs(len(product)-len(p)),"is_sim":is_sim}
  764. list_data.append(_d)
  765. list_data.sort(key=lambda x:x["distance"])
  766. for _d in list_data:
  767. is_sim = _d["is_sim"]
  768. if is_sim:
  769. if len(_d["product"])>len(product) and _d["product"].find(product)>=0:
  770. return product
  771. return _d["product"]
  772. return product
  773. def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
  774. _text = ""
  775. end_next = False
  776. for _child in childs:
  777. child_text = _child.get("text")
  778. if child_text.find(_product)>=0:
  779. if not is_begin:
  780. is_begin = True
  781. if not end_next:
  782. if _child["sentence_title"] is not None and isinstance(_child["title_next"],dict) and _child["title_next"]["sentence_title"] is not None:
  783. end_next = True
  784. end_title = _child["title_next"]
  785. logger.debug("end_title %s "%end_title["text"])
  786. logger.debug("%s-%s-%s"%("get_childs_text",child_text[:10],str(is_begin)))
  787. for p in products:
  788. if child_text.find(p)>=0 and is_similar(_product,p,90):
  789. is_begin = True
  790. if child_text.find(_product)<0 and not is_similar(_product,p,80) and (child_text.find(p)>=0 or _child["has_product"]):
  791. if is_begin:
  792. is_end = True
  793. logger.debug("%s-%s-%s"%("get_childs_text end",child_text[:10],p))
  794. break
  795. if re.search(end_pattern,child_text) is not None:
  796. if is_begin:
  797. is_end = True
  798. logger.debug("%s-%s-%s"%("get_childs_text end",child_text[:10],str(is_end)))
  799. if is_begin and is_end:
  800. break
  801. if is_begin:
  802. _text += _child.get("text")+"\r\n"
  803. childs2 = _child.get("child_title",[])
  804. if len(childs2)>0:
  805. for _child2 in childs2:
  806. child_text,is_begin,is_end = get_childs_text([_child2],_product,products,is_begin)
  807. if is_begin:
  808. _text += child_text
  809. if is_end:
  810. break
  811. if end_next:
  812. is_end = True
  813. # logger.debug("%s-%s-%s"%("get_childs_text1",_text,str(is_begin)))
  814. # logger.debug("%s-%s-%s"%("get_childs_text2",_text,str(is_begin)))
  815. return _text,is_begin,is_end
  816. def extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result,):
  817. _data = list_data[_data_i]
  818. childs = _data.get("child_title",[])
  819. if len(childs)>0:
  820. child_text,_,_ = get_childs_text([_data],_product,products)
  821. if len(child_text)>0:
  822. logger.info("extract_type by_tree child_text:%s"%child_text)
  823. list_result.append(child_text)
  824. if parent_title is not None:
  825. child_text,_,_ = get_childs_text([parent_title],_product,products)
  826. if len(child_text)>0:
  827. logger.info("extract_type by_tree child_text:%s"%child_text)
  828. list_result.append(child_text)
  829. childs = parent_title.get("child_title",[])
  830. if len(childs)>0:
  831. range_data = get_range_data_by_childs(list_data[_data_i:],childs)
  832. p_text = ""
  833. _find = False
  834. end_id = id(_data["title_next"]) if isinstance(_data["sentence_title"],dict) and _data["title_next"] is not None and _data["title_next"]["sentence_title"] is not None else None
  835. for pdata in range_data:
  836. ptext = pdata["text"]
  837. for p in products:
  838. if ptext.find(_product)<0 and (ptext.find(p)>=0 or pdata["has_product"]):
  839. _find = True
  840. break
  841. if re.search(end_pattern,ptext) is not None:
  842. _find = True
  843. if _find:
  844. break
  845. if id(pdata)==end_id:
  846. break
  847. p_text += ptext+"\r\n"
  848. if len(p_text)>0:
  849. logger.debug("extract_type by parent range_text:%s"%p_text)
  850. list_result.append(p_text)
  851. return True
  852. return False
  853. def get_table_pieces(_text,_product,products,list_result,_find):
  854. _soup = BeautifulSoup(_text,"lxml")
  855. _table = _soup.find("table")
  856. if _table is not None:
  857. trs = getTrs(_table)
  858. list_trs = []
  859. for tr in trs:
  860. tr_text = tr.get_text()
  861. if tr_text.find(_product)>=0:
  862. _find = True
  863. logger.debug("%s-%s"%("table_html_tr",tr_text))
  864. for p in products:
  865. if _find and p!=_product and tr_text.find(p)>=0:
  866. _find = False
  867. break
  868. if re.search(end_pattern,tr_text) is not None:
  869. _find = False
  870. break
  871. if _find:
  872. list_trs.append(tr)
  873. if len(list_trs)>0:
  874. table_html = "<table>%s</table>"%("\r\n".join([str(a) for a in list_trs]))
  875. logger.debug("extract_type table slices %s"%(table_html))
  876. list_result.append(table_html)
  877. def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result):
  878. _data = list_data[_data_i]
  879. _text = _data["text"]
  880. list_table = _data["list_table"]
  881. parent_title = _data["parent_title"]
  882. if list_table is not None:
  883. _check = True
  884. max_length = max([len(a) for a in list_table])
  885. min_length = min([len(a) for a in list_table])
  886. text_line_first = ",".join(a[0] for a in list_table[0])
  887. if max_length>10:
  888. if min_length<max_length/2:
  889. return
  890. last_data = list_data[_data_i-1]
  891. _flag = False
  892. if last_data["type"]=="sentence" and last_data["text"].find(_product)>=0:
  893. logger.debug("last sentence find product %s-%s"%(_product,last_data["text"]))
  894. _flag = True
  895. # print(text_line_first,"text_line_first",re.search(_param_pattern,text_line_first) is not None and text_line_first.find(_product)>=0)
  896. if re.search(_param_pattern,text_line_first) is not None and text_line_first.find(_product)>=0:
  897. _flag = True
  898. if _flag:
  899. if len(products)==0:
  900. logger.debug("extract_type whole table by param and product %s"%(_text))
  901. list_result.append(_text)
  902. else:
  903. for p in products:
  904. if p!=_product and _text.find(p)>=0:
  905. logger.debug("extract_type add all table failed %s-%s"%(_product,p))
  906. _flag = False
  907. break
  908. if _flag:
  909. logger.debug("extract_type add all table succeed")
  910. get_table_pieces(_text,_product,products,list_result,True)
  911. else:
  912. list_head_index = []
  913. for line in list_table[:2]:
  914. for cell_i in range(len(line)):
  915. cell = line[cell_i]
  916. cell_text = cell[0]
  917. if len(cell_text)<20 and re.search(_param_pattern,cell_text) is not None:
  918. list_head_index.append(cell_i)
  919. list_head_index = list(set(list_head_index))
  920. for line in list_table:
  921. for cell in line:
  922. cell_text = cell[0]
  923. if len(cell_text)>50 and len(re.findall(meter_pattern,cell_text))>5 and cell_text.find(_product)>=0:
  924. _f = True
  925. for cell in line:
  926. if not _f:
  927. break
  928. cell_text = cell[0]
  929. for p in products:
  930. if cell_text.find(p)>=0 and p!=_product:
  931. _f = False
  932. break
  933. if _f:
  934. logger.debug("extract_type param column %s"%(cell_text))
  935. list_result.append(cell_text)
  936. if len(cell_text)<len(_product)*10 and str(cell_text).find(_product)>=0:
  937. for _index in list_head_index:
  938. if _index>=len(line):
  939. continue
  940. _cell = line[_index]
  941. if len(cell[0])>0:
  942. logger.info("%s-%s"%("extract_type add on table text:",_cell[0]))
  943. list_result.append(_cell[0])
  944. if not _flag and (re.search(_param_pattern,_text) is not None or (parent_title is not None and re.search(_param_pattern,parent_title["text"]) is not None)) and _text.find(_product)>=0:
  945. get_table_pieces(_text,_product,products,list_result,False)
  946. def extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project):
  947. _text = _data["text"]
  948. if _text.find(_product)>=0:
  949. parent_title = _data.get("parent_title")
  950. parent_text = ""
  951. parent_parent_title = None
  952. parent_parent_text = ""
  953. parent_title_index = None
  954. parent_parent_title_index = None
  955. childs = get_childs([_data])
  956. child_find = False
  957. for c in childs:
  958. if re.search(_param_pattern,c["text"]) is not None and len(c["text"])<30:
  959. logger.debug("child text %s"%(c["text"]))
  960. child_find = True
  961. break
  962. extract_text,_,_ = get_childs_text([_data],_product,products)
  963. logger.debug("childs found extract_text %s %s"%(str(child_find),extract_text))
  964. if child_find:
  965. if len(extract_text)>0:
  966. list_result.append(extract_text)
  967. else:
  968. limit_nums = len(_product)*2+5
  969. if len(_product)<=3:
  970. limit_nums += 6
  971. if _text.find("数量")>=0:
  972. limit_nums += 6
  973. if len(_text)<=limit_nums and _data["sentence_title"] is not None:
  974. if re.search(meter_pattern,extract_text) is not None:
  975. list_result.append(extract_text)
  976. elif len(re.findall(meter_pattern,extract_text))>2:
  977. list_result.append(extract_text)
  978. if parent_title is not None:
  979. parent_text = parent_title.get("text","")
  980. parent_parent_title = parent_title.get("parent_title")
  981. parent_title_index = parent_title["title_index"]
  982. if parent_parent_title is not None:
  983. parent_parent_text = parent_parent_title.get("text","")
  984. parent_parent_title_index = parent_parent_title["title_index"]
  985. _suit = False
  986. if re.search(_param_pattern,_text) is not None and len(_text)<50:
  987. _suit = True
  988. if re.search(_param_pattern,parent_text) is not None and len(parent_text)<50:
  989. _suit = True
  990. if re.search(_param_pattern,parent_parent_text) is not None and len(parent_parent_text)<50:
  991. _suit = True
  992. if _suit:
  993. logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
  994. if not extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result):
  995. logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
  996. extract_parameters_by_tree(_product,products,list_data,_data_i,parent_parent_title,list_result)
  997. if re.search(_param_pattern,_text) is not None and len(_text)<50:
  998. childs = _data["child_title"]
  999. if len(childs)>0:
  1000. extract_text,_,_ = get_childs_text([_data],_product,products)
  1001. if len(extract_text)>0:
  1002. logger.debug("extract_type param-product %s"%(extract_text))
  1003. list_result.append(extract_text)
  1004. elif is_project:
  1005. extract_text,_,_ = get_childs_text([_data],_product,products,is_begin=True)
  1006. if len(extract_text)>0 and re.search(meter_pattern,extract_text) is not None:
  1007. logger.debug("extract_type sentence is_project param-product is product %s"%(extract_text))
  1008. list_result.append(extract_text)
  1009. def getBestProductText(list_result,_product,products):
  1010. list_result.sort(key=lambda x:len(re.findall(meter_pattern+"|"+'[::;;]|\d+[%A-Za-z]+',BeautifulSoup(x,"lxml").get_text())), reverse=True)
  1011. logger.debug("+++++++++++++++++++++")
  1012. for i in range(len(list_result)):
  1013. logger.debug("result%d %s"%(i,list_result[i]))
  1014. logger.debug("+++++++++++++++++++++")
  1015. for i in range(len(list_result)):
  1016. _result = list_result[i]
  1017. _check = True
  1018. _result_text = BeautifulSoup(_result,"lxml").get_text()
  1019. _search = re.search("项目编号[::]|项目名称[::]|联合体投标|开户银行",_result)
  1020. if _search is not None:
  1021. logger.debug("result%d error illegal text %s"%(i,str(_search)))
  1022. _check = False
  1023. if not (len(_result_text)<1000 and _result[:6]!="<table"):
  1024. for p in products:
  1025. if _result_text.find(p)>0 and not (is_similar(_product,p,80) or p.find(_product)>=0 or _product.find(p)>=0):
  1026. logger.debug("result%d error product scoss %s"%(i,p))
  1027. _check = False
  1028. if len(_result_text)<100:
  1029. if re.search(meter_pattern,_result_text) is None:
  1030. logger.debug("result%d error text min count"%(i))
  1031. _check = False
  1032. if len(_result_text)>5000:
  1033. if len(_result_text)>10000:
  1034. logger.debug("result%d error text max count"%(i))
  1035. _check = False
  1036. elif len(re.findall(meter_pattern,_result_text))<10:
  1037. logger.debug("result%d error text max count less meter"%(i))
  1038. _check = False
  1039. list_find = list(set(re.findall(meter_pattern,_result_text)))
  1040. not_list_find = list(set(re.findall(not_meter_pattern,_result_text)))
  1041. _count = len(list_find)-len(not_list_find)
  1042. has_num = False
  1043. for _find in list_find:
  1044. if re.search('[0-9a-zA-Z]',_find) is not None:
  1045. has_num = True
  1046. break
  1047. if not(_count>=2 and has_num or _count>=5):
  1048. logger.debug("result%d error match not enough"%(i))
  1049. _check = False
  1050. if _check:
  1051. return _result
  1052. def format_text(_result):
  1053. list_result = re.split("\r|\n",_result)
  1054. _result = ""
  1055. for _r in list_result:
  1056. if len(_r)>0:
  1057. _result+="%s\n"%(_r)
  1058. _result = '<div style="white-space:pre">%s</div>'%(_result)
  1059. return _result
  1060. def extract_product_parameters(list_data,_product):
  1061. list_result = []
  1062. _product = standard_product(_product.strip())
  1063. products = extract_products(list_data,_product)
  1064. _product = get_correct_product(_product,products)
  1065. logger.debug("all products %s-%s"%(_product,str(products)))
  1066. is_project = False
  1067. if re.search("项目名称|采购项目",_product) is not None:
  1068. is_project = True
  1069. if len(products)==1 and is_similar(products[0],_product,90):
  1070. is_project = True
  1071. _find_count = 0
  1072. for _data_i in range(len(list_data)):
  1073. _data = list_data[_data_i]
  1074. _type = _data["type"]
  1075. _text = _data["text"]
  1076. if _type=="sentence":
  1077. if _text.find(_product)>=0:
  1078. _find_count += 1
  1079. if re.search("项目名称|采购项目",_text) is not None and re.search("等",_text) is not None:
  1080. is_project = True
  1081. extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project)
  1082. elif _type=="table":
  1083. if _text.find(_product)>=0:
  1084. _find_count += 1
  1085. extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result)
  1086. _text = getBestProductText(list_result,_product,products)
  1087. return _text,_find_count
  1088. if __name__ == '__main__':
  1089. filepath = "download/4597dcc128bfabc7584d10590ae50656.html"
  1090. _product = "彩色多普勒超声诊断仪"
  1091. _html = open(filepath, "r", encoding="utf8").read()
  1092. pd = ParseDocument(_html,False)
  1093. pd.fix_tree(_product)
  1094. list_data = pd.tree
  1095. pd.print_tree(list_data)
  1096. _text,_count = extract_product_parameters(list_data,_product)
  1097. logger.info("find count:%d"%(_count))
  1098. logger.info("extract_parameter_text::%s"%(_text))