# htmlparser.py (59 KB)
  1. #coding:utf8
  2. import re
  3. # from BaseDataMaintenance.maintenance.product.productUtils import is_similar
  4. # from BiddingKG.dl.common.Utils import log
  5. import logging
  6. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  7. logger = logging.getLogger(__name__)
  8. logger.setLevel(logging.INFO)
  9. def log(msg):
  10. '''
  11. @summary:打印信息
  12. '''
  13. logger.info(msg)
  14. from bs4 import BeautifulSoup
  15. import copy
  16. import Levenshtein
  17. def jaccard_score(source,target):
  18. source_set = set([s for s in source])
  19. target_set = set([s for s in target])
  20. if len(source_set)==0 or len(target_set)==0:
  21. return 0
  22. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  23. def judge_pur_chinese(keyword):
  24. """
  25. 中文字符的编码范围为: u'\u4e00' -- u'\u9fff:只要在此范围内就可以判断为中文字符串
  26. @param keyword:
  27. @return:
  28. """
  29. # 定义一个需要删除的标点符号字符串列表
  30. remove_chars = '[·’!"\#$%&\'()#!()*+,-./:;<=>?\@,:?¥★、….>【】[]《》?“”‘’\[\\]^_`{|}~]+'
  31. # 利用re.sub来删除中文字符串中的标点符号
  32. strings = re.sub(remove_chars, "", keyword) # 将keyword中文字符串中remove_chars中包含的标点符号替换为空字符串
  33. for ch in strings:
  34. if u'\u4e00' <= ch <= u'\u9fff':
  35. pass
  36. else:
  37. return False
  38. return True
  39. def is_similar(source,target,_radio=None):
  40. source = str(source).lower()
  41. target = str(target).lower()
  42. max_len = max(len(source),len(target))
  43. min_len = min(len(source),len(target))
  44. min_ratio = 90
  45. if min_len>=3:
  46. min_ratio = 87
  47. if min_len>=5:
  48. min_ratio = 85
  49. if _radio is not None:
  50. min_ratio = _radio
  51. # dis_len = abs(len(source)-len(target))
  52. # min_dis = min(max_len*0.2,4)
  53. if min_len==0 and max_len>0:
  54. return False
  55. if max_len<=2:
  56. if source==target:
  57. return True
  58. if min_len<2:
  59. return False
  60. #判断相似度
  61. similar = Levenshtein.ratio(source,target)*100
  62. if similar>=min_ratio:
  63. log("%s and %s similar_jaro %d"%(source,target,similar))
  64. return True
  65. similar_jaro = Levenshtein.jaro(source,target)
  66. if similar_jaro*100>=min_ratio:
  67. log("%s and %s similar_jaro %d"%(source,target,similar_jaro*100))
  68. return True
  69. similar_jarow = Levenshtein.jaro_winkler(source,target)
  70. if similar_jarow*100>=min_ratio:
  71. log("%s and %s similar_jaro %d"%(source,target,similar_jarow*100))
  72. return True
  73. if min_len>=5:
  74. if len(source)==max_len and str(source).find(target)>=0:
  75. return True
  76. elif len(target)==max_len and target.find(source)>=0:
  77. return True
  78. elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
  79. return True
  80. return False
# Regex alternation of commercial / scoring section headings.
# Not referenced in the visible part of this file - presumably used by
# callers elsewhere; TODO confirm.
  81. end_pattern = "商务要求|评分标准|商务条件|商务条件"
# Regex matching headings that announce product/technical parameter or
# requirement sections; buildParsetree applies it with re.search to
# sentences shorter than 40 characters.
  82. _param_pattern = "(产品|技术|清单|配置|参数|具体|明细|项目|招标|货物|服务|规格|工作|具体)[及和与]?(指标|配置|条件|要求|参数|需求|规格|条款|名称及要求)|配置清单|(质量|技术).{,10}要求|验收标准|^(参数|功能)$"
# Regex alternation of measurement expressions (numbers with units,
# comparison signs) and technical-attribute keywords.
# Not referenced in the visible part of this file; TODO confirm usage.
  83. meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
# Regex alternation of pricing / commercial / scoring vocabulary.
# Not referenced in the visible part of this file; TODO confirm usage.
  84. not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|专家论证意见|评标方法|代理服务费|售后服务|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方"
  85. def getTrs(tbody):
  86. #获取所有的tr
  87. trs = []
  88. if tbody.name=="table":
  89. body = tbody.find("tbody",recursive=False)
  90. if body is not None:
  91. tbody = body
  92. objs = tbody.find_all(recursive=False)
  93. for obj in objs:
  94. if obj.name=="tr":
  95. trs.append(obj)
  96. if obj.name=="tbody" or obj.name=="table":
  97. for tr in obj.find_all("tr",recursive=False):
  98. trs.append(tr)
  99. return trs
  100. def fixSpan(tbody):
  101. # 处理colspan, rowspan信息补全问题
  102. #trs = tbody.findChildren('tr', recursive=False)
  103. trs = getTrs(tbody)
  104. ths_len = 0
  105. ths = list()
  106. trs_set = set()
  107. #修改为先进行列补全再进行行补全,否则可能会出现表格解析混乱
  108. # 遍历每一个tr
  109. for indtr, tr in enumerate(trs):
  110. ths_tmp = tr.findChildren('th', recursive=False)
  111. #不补全含有表格的tr
  112. if len(tr.findChildren('table'))>0:
  113. continue
  114. if len(ths_tmp) > 0:
  115. ths_len = ths_len + len(ths_tmp)
  116. for th in ths_tmp:
  117. ths.append(th)
  118. trs_set.add(tr)
  119. # 遍历每行中的element
  120. tds = tr.findChildren(recursive=False)
  121. for indtd, td in enumerate(tds):
  122. # 若有colspan 则补全同一行下一个位置
  123. if 'colspan' in td.attrs:
  124. if str(re.sub("[^0-9]","",str(td['colspan'])))!="":
  125. col = int(re.sub("[^0-9]","",str(td['colspan'])))
  126. if col<100 and len(td.get_text())<1000:
  127. td['colspan'] = 1
  128. for i in range(1, col, 1):
  129. td.insert_after(copy.copy(td))
  130. for indtr, tr in enumerate(trs):
  131. ths_tmp = tr.findChildren('th', recursive=False)
  132. #不补全含有表格的tr
  133. if len(tr.findChildren('table'))>0:
  134. continue
  135. if len(ths_tmp) > 0:
  136. ths_len = ths_len + len(ths_tmp)
  137. for th in ths_tmp:
  138. ths.append(th)
  139. trs_set.add(tr)
  140. # 遍历每行中的element
  141. tds = tr.findChildren(recursive=False)
  142. for indtd, td in enumerate(tds):
  143. # 若有rowspan 则补全下一行同样位置
  144. if 'rowspan' in td.attrs:
  145. if str(re.sub("[^0-9]","",str(td['rowspan'])))!="":
  146. row = int(re.sub("[^0-9]","",str(td['rowspan'])))
  147. td['rowspan'] = 1
  148. for i in range(1, row, 1):
  149. # 获取下一行的所有td, 在对应的位置插入
  150. if indtr+i<len(trs):
  151. tds1 = trs[indtr + i].findChildren(['td','th'], recursive=False)
  152. if len(tds1) >= (indtd) and len(tds1)>0:
  153. if indtd > 0:
  154. tds1[indtd - 1].insert_after(copy.copy(td))
  155. else:
  156. tds1[0].insert_before(copy.copy(td))
  157. elif indtd-2>0 and len(tds1) > 0 and len(tds1) == indtd - 1: # 修正某些表格最后一列没补全
  158. tds1[indtd-2].insert_after(copy.copy(td))
  159. def getTable(tbody):
  160. #trs = tbody.findChildren('tr', recursive=False)
  161. fixSpan(tbody)
  162. trs = getTrs(tbody)
  163. inner_table = []
  164. for tr in trs:
  165. tr_line = []
  166. tds = tr.findChildren(['td','th'], recursive=False)
  167. if len(tds)==0:
  168. tr_line.append([re.sub('\xa0','',tr.get_text()),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
  169. for td in tds:
  170. tr_line.append([re.sub('\xa0','',td.get_text()),0])
  171. #tr_line.append([td.get_text(),0])
  172. inner_table.append(tr_line)
  173. return inner_table
  174. class Sentence2():
  175. def __init__(self,text,sentence_index,wordOffset_begin,wordOffset_end):
  176. self.name = 'sentence2'
  177. self.text = text
  178. self.sentence_index = sentence_index
  179. self.wordOffset_begin = wordOffset_begin
  180. self.wordOffset_end = wordOffset_end
  181. def get_text(self):
  182. return self.text
  183. class ParseDocument():
  184. def __init__(self,_html,auto_merge_table=True,list_obj = []):
  185. if _html is None:
  186. _html = ""
  187. self.html = _html
  188. # self.soup = BeautifulSoup(self.html,"lxml")
  189. # self.soup = BeautifulSoup(self.html,"html.parser")
  190. self.auto_merge_table = auto_merge_table
  191. if list_obj:
  192. self.list_obj = list_obj
  193. else:
  194. self.soup = BeautifulSoup(self.html, "lxml")
  195. _body = self.soup.find("body")
  196. if _body is not None:
  197. self.soup = _body
  198. self.list_obj = self.get_soup_objs(self.soup)
  199. # self.list_obj = [it.get_text().strip().replace(' ', '') for it in self.list_obj]
  200. # self.list_obj = [Sentence2(text, 1,1,5) for text in self.list_obj]
  201. # for obj in self.list_obj:
  202. # print("obj",obj.get_text()[:20])
  203. self.tree = self.buildParsetree(self.list_obj,[],auto_merge_table)
  204. # #识别目录树
  205. # if self.parseTree:
  206. # self.parseTree.printParseTree()
  207. # self.print_tree(self.tree,"-|")
  208. def get_soup_objs(self,soup,list_obj=None):
  209. if list_obj is None:
  210. list_obj = []
  211. childs = soup.find_all(recursive=False)
  212. for _obj in childs:
  213. childs1 = _obj.find_all(recursive=False)
  214. if len(childs1)==0 or len(_obj.get_text())<40 or _obj.name=="table":
  215. list_obj.append(_obj)
  216. elif _obj.name=="p":
  217. list_obj.append(_obj)
  218. else:
  219. self.get_soup_objs(_obj,list_obj)
  220. return list_obj
  221. def fix_tree(self,_product):
  222. products = extract_products(self.tree,_product)
  223. if len(products)>0:
  224. self.tree = self.buildParsetree(self.list_obj,products,self.auto_merge_table)
  225. def print_tree(self,tree,append=""):
  226. self.set_tree_id = set()
  227. if append=="":
  228. for t in tree:
  229. logger.debug("%s text:%s title:%s title_text:%s before:%s after%s product:%s"%("==>",t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"]))
  230. for t in tree:
  231. _id = id(t)
  232. if _id in self.set_tree_id:
  233. continue
  234. self.set_tree_id.add(_id)
  235. logger.info("%s text:%s title:%s title_text:%s before:%s after%s product:%s"%(append,t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"]))
  236. childs = t["child_title"]
  237. self.print_tree(childs,append=append+"-|")
  238. def is_title_first(self,title):
  239. if title in ("一","1","Ⅰ","a","A"):
  240. return True
  241. return False
  242. def find_title_by_pattern(self,_text,_pattern="(^|★|▲|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册包标部.::、、]+))|" \
  243. "([\s★▲\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>[、章册包标部.::、、]+))|" \
  244. "([\s★▲\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]+))|" \
  245. "([\s★▲\*]*)(?P<title_5>(?P<title_5_index_0_0>^)(?P<title_5_index_1_1>[一二三四五六七八九十]+)(?P<title_5_index_2_0>)[^一二三四五六七八九十节章册部\.::、、])|" \
  246. "([\s★▲\*]*)(?P<title_12>(?P<title_12_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_12_index_1_1>\d{1,2})(?P<title_12_index_2_0>[\..、\s\-]?))|"\
  247. "([\s★▲\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
  248. "([\s★▲\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
  249. "([\s★▲\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..包标::、\s\-]*))|" \
  250. "(^[\s★▲\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-包标]*))|" \
  251. "([\s★▲\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))包标\..::、]+))|" \
  252. "([\s★▲\*]+)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>[))包标\..::、]+))|" \
  253. "([\s★▲\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))"
  254. ):
  255. _se = re.search(_pattern,_text)
  256. groups = []
  257. if _se is not None:
  258. _gd = _se.groupdict()
  259. for k,v in _gd.items():
  260. if v is not None:
  261. groups.append((k,v))
  262. if len(groups):
  263. # groups.sort(key=lambda x:x[0])
  264. return groups
  265. return None
  266. def make_increase(self,_sort,_title,_add=1):
  267. if len(_title)==0 and _add==0:
  268. return ""
  269. if len(_title)==0 and _add==1:
  270. return _sort[0]
  271. _index = _sort.index(_title[-1])
  272. next_index = (_index+_add)%len(_sort)
  273. next_chr = _sort[next_index]
  274. if _index==len(_sort)-1:
  275. _add = 1
  276. else:
  277. _add = 0
  278. return next_chr+self.make_increase(_sort,_title[:-1],_add)
# get_next_title(_title): return the index expected to follow _title in
# the same numbering scheme, or None when _title fits no known scheme.
# Schemes, tested in order: decimal digits, Chinese numerals
# (一..十 and 百), lowercase letters, uppercase letters, a single Roman
# numeral character (Ⅰ..Ⅻ).
# NOTE(review): indentation was lost in this copy, so the exact nesting of
# the branches for titles ending in "九" cannot be read off the text;
# restore it from the upstream file before changing this logic.
  279. def get_next_title(self,_title):
# Decimal index: plain integer increment.
  280. if re.search("^\d+$",_title) is not None:
  281. return str(int(_title)+1)
# Chinese numerals.
  282. if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
  283. if _title[-1]=="十":
  284. return _title+"一"
  285. if _title[-1]=="百":
  286. return _title+"零一"
# Titles ending in 九 roll over to the next ten / hundred.
  287. if _title[-1]=="九":
  288. if len(_title)==1:
  289. return "十"
  290. if len(_title)==2:
  291. if _title[0]=="十":
  292. return "二十"
  293. if len(_title)==3:
  294. if _title[0]=="九":
  295. return "一百"
  296. else:
  297. _next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title[0]))
  298. return _next_title+"十"
# General case: drop the positional markers 十/百, increment the remaining
# digits, then re-insert the markers by position.
  299. _next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
# make_increase returns the symbols least-significant first.
  300. _next_title = list(_next_title)
  301. _next_title.reverse()
  302. if _next_title[-1]!="十":
  303. if len(_next_title)>=2:
  304. _next_title.insert(-1,'十')
  305. if len(_next_title)>=4:
  306. _next_title.insert(-3,'百')
  307. if _title[0]=="十":
# NOTE(review): _next_title is a list at this point, so the comparison
# with the string "十" below is never true.
  308. if _next_title=="十":
  309. _next_title = ["二","十"]
  310. _next_title.insert(0,"十")
  311. _next_title = "".join(_next_title)
  312. return _next_title
# Lowercase letters: increment with carry over a..z.
  313. if re.search("^[a-z]+$",_title) is not None:
  314. _next_title = self.make_increase([chr(i+ord('a')) for i in range(26)],_title)
  315. _next_title = list(_next_title)
  316. _next_title.reverse()
  317. return "".join(_next_title)
# Uppercase letters: increment with carry over A..Z.
  318. if re.search("^[A-Z]+$",_title) is not None:
  319. _next_title = self.make_increase([chr(i+ord('A')) for i in range(26)],_title)
  320. _next_title = list(_next_title)
  321. _next_title.reverse()
  322. return "".join(_next_title)
# Single Roman numeral character: next entry of the fixed list; Ⅻ has no
# successor and falls through to None.
  323. if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
  324. _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
  325. _index = _sort.index(_title)
  326. if _index<len(_sort)-1:
  327. return _sort[_index+1]
  328. return None
# count_title_before(list_obj): pre-scan used by buildParsetree.
# Returns a tuple (dict_before, illegal_sentence):
#   dict_before      - how often each normalised "text before the index"
#                      of a detected title marker occurs
#   illegal_sentence - set of sentence texts to skip: texts matching a page
#                      marker regex, and texts of length 11..99 that occur
#                      more than 10 times
# Each object is expected to provide .text and .name.
# NOTE(review): indentation was lost in this copy; whether the title
# detection also runs for table objects cannot be read off the text.
  329. def count_title_before(self,list_obj):
  330. dict_before = {}
  331. dict_sentence_count = {}
  332. illegal_sentence = set()
  333. for obj_i in range(len(list_obj)):
  334. obj = list_obj[obj_i]
  335. _type = "sentence"
  336. _text = obj.text.strip()
# Tables are represented by their full markup.
  337. if obj.name=="table":
  338. _type = "table"
  339. _text = str(obj)
# _append is assigned but not read in this method.
  340. _append = False
  341. if _type=="sentence":
# Count mid-length sentences so repeated ones can be flagged afterwards.
  342. if len(_text)>10 and len(_text)<100:
  343. if _text not in dict_sentence_count:
  344. dict_sentence_count[_text] = 0
  345. dict_sentence_count[_text] += 1
  346. if re.search("\d+页",_text) is not None:
  347. illegal_sentence.add(_text)
  348. elif len(_text)<10:
  349. if re.search("第\d+页",_text) is not None:
  350. illegal_sentence.add(_text)
# Only the first 10 characters are searched for a title marker.
  351. sentence_groups = self.find_title_by_pattern(_text[:10])
  352. if sentence_groups:
  353. # print("sentence_groups",sentence_groups)
# Group layout: [0] whole marker, [1] text before the index,
# [-2] the index, [-1] text after the index.
  354. sentence_title = sentence_groups[0][0]
  355. sentence_title_text = sentence_groups[0][1]
  356. title_index = sentence_groups[-2][1]
# NOTE(review): several replace() calls below have identical arguments in
# this copy and are no-ops - presumably full-width originals were lost;
# verify against the upstream file.
  357. title_before = sentence_groups[1][1].replace("(","(").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".")
  358. title_after = sentence_groups[-1][1].replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".")
# next_index, title_after, sentence_title and sentence_title_text are
# computed but only title_before feeds the result.
  359. next_index = self.get_next_title(title_index)
  360. if title_before not in dict_before:
  361. dict_before[title_before] = 0
  362. dict_before[title_before] += 1
# Sentences repeated more than 10 times are treated as illegal as well.
  363. for k,v in dict_sentence_count.items():
  364. if v>10:
  365. illegal_sentence.add(k)
  366. return dict_before,illegal_sentence
  367. def is_page_no(self,sentence):
  368. if len(sentence)<10:
  369. if re.search("\d+页|^\-\d+\-$",sentence) is not None:
  370. return True
# block_tree(childs): set node["block"] = True on the given nodes and
# recurse into their "child_title" lists. buildParsetree skips blocked
# nodes when it looks for a preceding sibling title.
# NOTE(review): indentation was lost in this copy; whether the recursion is
# limited to nodes that were not yet blocked cannot be read off the text -
# confirm against the upstream file.
  371. def block_tree(self,childs):
  372. for child in childs:
  373. if not child["block"]:
  374. child["block"] = True
  375. childs2 = child["child_title"]
  376. self.block_tree(childs2)
# buildParsetree(list_obj, products, auto_merge_table): turn the flat
# object list into a list of node dicts linked into a title tree.
#
# Each node dict carries the keys: type, text, list_table, line_width,
# sentence_title, title_index, sentence_title_text, sentence_groups,
# parent_title, child_title, title_before, title_after, title_next,
# next_index, block, has_product, sentence_index, wordOffset_begin,
# wordOffset_end.
#
# Steps visible in the code:
#   1. derive max_length from the text lengths of the first 200 non-table
#      objects, capped at 40
#   2. call count_title_before for the title-prefix counts and the set of
#      sentences to skip
#   3. per object: detect a title marker in the first 10 characters, or
#      synthesise a "title_0" node when the text matches one of `products`
#   4. append untitled sentences to the previous node when that node's
#      line_width reaches a fraction of max_length
#   5. parse tables with getTable and, when auto_merge_table is set, merge
#      a table into a nearby previous table with the same column count
#   6. link each new node to a previous sibling (title_next) and/or to a
#      parent (parent_title / child_title)
#   7. propagate has_product between adjacent sibling titles
# Returns the flat list of node dicts (list_data).
#
# NOTE(review): `products=[]` is a mutable default; it is only read in the
# visible code.
# NOTE(review): indentation was lost in this copy, so the nesting of the
# statements below must be restored from the upstream file before this
# method is modified.
  377. def buildParsetree(self,list_obj,products=[],auto_merge_table=True):
  378. self.parseTree = None
# `trees` is assigned but not read in this method.
  379. trees = []
  380. list_length = []
  381. for obj in list_obj[:200]:
  382. if obj.name!="table":
  383. list_length.append(len(obj.get_text()))
  384. if len(list_length)>0:
  385. max_length = max(list_length)
  386. else:
  387. max_length = 40
  388. max_length = min(max_length,40)
  389. logger.debug("%s:%d"%("max_length",max_length))
  390. list_data = []
# State describing the most recent table node, used for table merging.
  391. last_table_index = None
  392. last_table_columns = None
  393. last_table = None
  394. dict_before,illegal_sentence = self.count_title_before(list_obj)
  395. for obj_i in range(len(list_obj)):
  396. obj = list_obj[obj_i]
  397. # logger.debug("==obj %s"%obj.text[:20])
  398. _type = "sentence"
  399. _text = standard_product(obj.text)
  400. if obj.name=="table":
  401. _type = "table"
  402. _text = standard_product(str(obj))
# Per-object defaults for the node fields.
  403. _append = False
  404. sentence_title = None
  405. sentence_title_text = None
  406. sentence_groups = None
  407. title_index = None
  408. next_index = None
  409. parent_title = None
  410. title_before = None
  411. title_after = None
  412. title_next = None
  413. childs = []
  414. # new
# Position attributes are read from every object.
  415. sentence_index = obj.sentence_index
  416. wordOffset_begin = obj.wordOffset_begin
  417. wordOffset_end = obj.wordOffset_end
  418. list_table = None
  419. block = False
  420. has_product = False
  421. if _type=="sentence":
# Sentences flagged by count_title_before are dropped entirely.
  422. if _text in illegal_sentence:
  423. continue
  424. sentence_groups = self.find_title_by_pattern(_text[:10])
  425. if sentence_groups:
# Group layout: [0] whole marker, [1] text before the index,
# [-2] the index, [-1] text after the index.
  426. title_before = standard_title_context(sentence_groups[1][1])
  427. title_after = sentence_groups[-1][1]
  428. sentence_title_text = sentence_groups[0][1]
  429. other_text = _text.replace(sentence_title_text,"")
# A marker counts as a title when its prefix occurs more than once in the
# document or when it has a non-empty suffix.
  430. if (title_before in dict_before and dict_before[title_before]>1) or title_after!="":
  431. sentence_title = sentence_groups[0][0]
  432. title_index = sentence_groups[-2][1]
  433. next_index = self.get_next_title(title_index)
  434. other_text = _text.replace(sentence_title_text,"")
  435. for p in products:
  436. if other_text.strip()==p.strip():
  437. has_product = True
  438. else:
# Not accepted as a title: promote to a synthetic product title when the
# remaining text equals a product name, otherwise clear the title fields.
  439. _fix = False
  440. for p in products:
  441. if other_text.strip()==p.strip():
  442. title_before = "=产品"
  443. sentence_title = "title_0"
  444. sentence_title_text = p
  445. title_index = "0"
  446. title_after = "产品="
  447. next_index = "0"
  448. _fix = True
  449. has_product = True
  450. break
  451. if not _fix:
  452. title_before = None
  453. title_after = None
  454. sentence_title_text = None
  455. else:
# No marker: a short sentence matching _param_pattern that contains a
# product name also becomes a synthetic product title.
  456. if len(_text)<40 and re.search(_param_pattern,_text) is not None:
  457. for p in products:
  458. if _text.find(p)>=0:
  459. title_before = "=产品"
  460. sentence_title = "title_0"
  461. sentence_title_text = p
  462. title_index = "0"
  463. title_after = "产品="
  464. next_index = "0"
# _fix is assigned here but not read afterwards on this path.
  465. _fix = True
  466. has_product = True
  467. break
  468. if _type=="sentence":
# Untitled sentence following a wide line: treated as a continuation and
# appended to the previous node instead of creating a new one.
  469. if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
  470. list_data[-1]["text"] += _text
  471. list_data[-1]["line_width"] = len(_text)
  472. _append = True
  473. elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
  474. if list_data[-1]["line_width"]>=max_length*0.7:
  475. list_data[-1]["text"] += _text
  476. list_data[-1]["line_width"] = len(_text)
  477. _append = True
  478. if _type=="table":
# Re-parse the table markup so span fixing works on a private copy.
  479. _soup = BeautifulSoup(_text,"lxml")
  480. _table = _soup.find("table")
  481. if _table is not None:
  482. list_table = getTable(_table)
  483. if len(list_table)==0:
  484. continue
  485. table_columns = len(list_table[0])
  486. if auto_merge_table:
# Merge with the previous table when it is at most 2 objects away and has
# the same number of columns.
  487. if last_table_index is not None and abs(obj_i-last_table_index)<=2 and last_table_columns is not None and last_table_columns==table_columns:
  488. if last_table is not None:
  489. trs = getTrs(_table)
  490. last_tbody = BeautifulSoup(last_table["text"],"lxml")
  491. _table = last_tbody.find("table")
  492. last_trs = getTrs(_table)
  493. _append = True
  494. for _line in list_table:
  495. last_table["list_table"].append(_line)
  496. if len(last_trs)>0:
  497. for _tr in trs:
  498. last_trs[-1].insert_after(copy.copy(_tr))
# Strip the html/body wrapper tags from the re-serialised markup.
  499. last_table["text"] = re.sub("</?html>|</?body>","",str(last_tbody))
  500. last_table_index = obj_i
  501. last_table_columns = len(list_table[-1])
  502. if not _append:
  503. _data = {"type":_type, "text":_text,"list_table":list_table,"line_width":len(_text),"sentence_title":sentence_title,"title_index":title_index,
  504. "sentence_title_text":sentence_title_text,"sentence_groups":sentence_groups,"parent_title":parent_title,
  505. "child_title":childs,"title_before":title_before,"title_after":title_after,"title_next":title_next,"next_index":next_index,
  506. "block":block,"has_product":has_product,
  507. "sentence_index":sentence_index,"wordOffset_begin":wordOffset_begin,"wordOffset_end":wordOffset_end
  508. }
  509. if _type=="table":
  510. last_table = _data
  511. last_table_index = obj_i
  512. if list_table:
  513. last_table_columns = last_table_columns = len(list_table[-1])
  514. if sentence_title is not None:
  515. if len(list_data)>0:
# A first index (一, 1, Ⅰ, a, A) opens a new level under the nearest
# preceding titled node.
  516. if self.is_title_first(title_index):
  517. for i in range(1,len(list_data)+1):
  518. _d = list_data[-i]
  519. if _d["sentence_title"] is not None:
  520. _data["parent_title"] = _d
  521. _d["child_title"].append(_data)
  522. break
  523. else:
# Otherwise look backwards for the preceding sibling; the searches below
# go from the strictest match to the loosest.
  524. _find = False
# Search 1: same title style and exact before/after text, with the
# sibling's next_index equal to this index.
  525. for i in range(1,len(list_data)+1):
  526. if _find:
  527. break
  528. _d = list_data[-i]
  529. if _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
  530. if _d["next_index"]==title_index and _d["title_next"] is None and not _d["block"]:
  531. _data["parent_title"] = _d["parent_title"]
  532. _d["title_next"] = _data
  533. if len(_d["child_title"])>0:
  534. _d["child_title"][-1]["title_next"] = ""
  535. self.block_tree(_d["child_title"])
  536. if _d["parent_title"] is not None:
  537. _d["parent_title"]["child_title"].append(_data)
  538. _find = True
  539. break
# Search 2: the condition requires i==1, i.e. only the immediately
# preceding node, with the same style and before/after text.
  540. for i in range(1,len(list_data)+1):
  541. if _find:
  542. break
  543. _d = list_data[-i]
  544. if i==1 and not _d["block"] and _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
  545. _data["parent_title"] = _d["parent_title"]
  546. _d["title_next"] = _data
  547. if len(_d["child_title"])>0:
  548. _d["child_title"][-1]["title_next"] = ""
  549. self.block_tree(_d["child_title"])
  550. if _d["parent_title"] is not None:
  551. _d["parent_title"]["child_title"].append(_data)
  552. _find = True
  553. break
# From here on the before/after text is compared in normalised form.
  554. title_before = standard_title_context(title_before)
  555. title_after = standard_title_context(title_after)
# Search 3: like search 1, on normalised before/after text.
  556. for i in range(1,len(list_data)+1):
  557. if _find:
  558. break
  559. _d = list_data[-i]
  560. if _d.get("sentence_title")==sentence_title and title_before==standard_title_context(_d["title_before"]) and title_after==standard_title_context(_d["title_after"]):
  561. if _d["next_index"]==title_index and _d["title_next"] is None and not _d["block"]:
  562. _data["parent_title"] = _d["parent_title"]
  563. _d["title_next"] = _data
  564. if len(_d["child_title"])>0:
  565. _d["child_title"][-1]["title_next"] = ""
  566. self.block_tree(_d["child_title"])
  567. if _d["parent_title"] is not None:
  568. _d["parent_title"]["child_title"].append(_data)
  569. _find = True
  570. break
# Search 4: same style and normalised before/after text, without the
# next_index requirement.
  571. for i in range(1,len(list_data)+1):
  572. if _find:
  573. break
  574. _d = list_data[-i]
  575. if not _d["block"] and _d.get("sentence_title")==sentence_title and title_before==standard_title_context(_d["title_before"]) and title_after==standard_title_context(_d["title_after"]):
  576. _data["parent_title"] = _d["parent_title"]
  577. _d["title_next"] = _data
  578. if len(_d["child_title"])>0:
  579. _d["child_title"][-1]["title_next"] = ""
  580. # self.block_tree(_d["child_title"])
  581. if _d["parent_title"] is not None:
  582. _d["parent_title"]["child_title"].append(_data)
  583. _find = True
  584. break
# Search 5: same style and normalised before text only, limited to the
# last 19 nodes.
  585. for i in range(1,min(len(list_data)+1,20)):
  586. if _find:
  587. break
  588. _d = list_data[-i]
  589. if not _d["block"] and _d.get("sentence_title")==sentence_title and title_before==standard_title_context(_d["title_before"]):
  590. _data["parent_title"] = _d["parent_title"]
  591. _d["title_next"] = _data
  592. if len(_d["child_title"])>0:
  593. _d["child_title"][-1]["title_next"] = ""
  594. # self.block_tree(_d["child_title"])
  595. if _d["parent_title"] is not None:
  596. _d["parent_title"]["child_title"].append(_data)
  597. _find = True
  598. break
# No sibling found: attach under the nearest preceding titled node.
  599. if not _find:
  600. if len(list_data)>0:
  601. for i in range(1,len(list_data)+1):
  602. _d = list_data[-i]
  603. if _d.get("sentence_title") is not None:
  604. _data["parent_title"] = _d
  605. _d["child_title"].append(_data)
  606. break
  607. else:
# Same attachment rule: child of the nearest preceding titled node.
  608. if len(list_data)>0:
  609. for i in range(1,len(list_data)+1):
  610. _d = list_data[-i]
  611. if _d.get("sentence_title") is not None:
  612. _data["parent_title"] = _d
  613. _d["child_title"].append(_data)
  614. break
  615. list_data.append(_data)
# Post-pass: a titled child inherits has_product from an adjacent sibling
# that shares its before/after text and already has has_product set.
  616. for _data in list_data:
  617. childs = _data["child_title"]
  618. for c_i in range(len(childs)):
  619. cdata = childs[c_i]
  620. if cdata["has_product"]:
  621. continue
  622. else:
  623. if c_i>0:
  624. last_cdata = childs[c_i-1]
  625. if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
  626. cdata["has_product"] = True
  627. if c_i<len(childs)-1:
  628. last_cdata = childs[c_i+1]
  629. if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
  630. cdata["has_product"] = True
# Second sweep over the children in reverse order.
# NOTE(review): cdata is taken from the reversed position but the
# neighbours are still addressed as childs[c_i-1] / childs[c_i+1]; this
# looks inconsistent - confirm the intent.
  631. for c_i in range(len(childs)):
  632. cdata = childs[len(childs)-1-c_i]
  633. if cdata["has_product"]:
  634. continue
  635. else:
  636. if c_i>0:
  637. last_cdata = childs[c_i-1]
  638. if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
  639. cdata["has_product"] = True
  640. if c_i<len(childs)-1:
  641. last_cdata = childs[c_i+1]
  642. if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
  643. cdata["has_product"] = True
  644. return list_data
  645. def standard_title_context(_title_context):
  646. return _title_context.replace("(","(").replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")
  647. def standard_product(sentence):
  648. return sentence.replace("(","(").replace(")",")")
  # extract_products: collect candidate product names from the "table" entries of list_data.
  #   list_data      -- sequence of dicts; each is read through the keys "type" and "text", and
  #                     entries whose type is "table" are also read through "list_table".
  #   _product       -- target product name; it is passed through standard_product() first.
  #   _param_pattern -- regex that recognises header cells naming a product/item column.
  # Returns a list of strings built with list(set(...)), so duplicates are removed and the
  # order of the result is not defined.
  # NOTE(review): this copy of the file has lost its indentation (and carries gutter numbers),
  # so the nesting of the statements below cannot be read off the text; restore it from the
  # upstream source before changing any logic here.
  649. def extract_products(list_data,_product,_param_pattern = "产品名称|设备材料|采购内存|标的名称|采购内容|(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体|标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)[\))的]?([、\w]{,4}名称|内容|描述)|标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$"):
  650. _product = standard_product(_product)
  651. list_result = []
  652. list_table_products = []
  653. for _data_i in range(len(list_data)):
  654. _data = list_data[_data_i]
  655. _type = _data["type"]
  656. _text = _data["text"]
  657. if _type=="table":
  658. list_table = _data["list_table"]
  659. if list_table is None:
  660. continue
  661. _check = True
  # max()/min() over the row lengths raise ValueError for an empty list_table; only None is
  # filtered out above -- TODO confirm that an empty table can never reach this point.
  662. max_length = max([len(a) for a in list_table])
  663. min_length = min([len(a) for a in list_table])
  # Tables whose shortest row has fewer than half the cells of the longest row are ignored.
  664. if min_length<max_length/2:
  665. continue
  666. list_head_index = []
  667. _begin_index = 0
  668. head_cell_text = ""
  # Look for header cells in the first two rows: a short cell (<10 chars) matching
  # _param_pattern, in a row that also mentions price/quantity/spec style keywords.
  669. for line_i in range(len(list_table[:2])):
  670. line = list_table[line_i]
  671. line_text = ",".join([cell[0] for cell in line])
  672. for cell_i in range(len(line)):
  673. cell = line[cell_i]
  674. cell_text = cell[0]
  675. if len(cell_text)<10 and re.search(_param_pattern,cell_text) is not None and re.search("单价|数量|预算|限价|总价|品牌|规格|型号|用途|要求|采购量",line_text) is not None:
  676. _begin_index = line_i+1
  677. list_head_index.append(cell_i)
  # Concatenate the text found in the candidate header columns into head_cell_text.
  678. for line_i in range(len(list_table)):
  679. line = list_table[line_i]
  680. for cell_i in list_head_index:
  681. if cell_i>=len(line):
  682. continue
  683. cell = line[cell_i]
  684. cell_text = cell[0]
  685. head_cell_text += cell_text
  686. # print("===head_cell_text",head_cell_text)
  # The candidate columns are discarded when that text mentions buyer/project/amount
  # keywords or consists of digits only.
  687. if re.search("招标人|采购人|项目编号|项目名称|金额|^\d+$",head_cell_text) is not None:
  688. list_head_index = []
  # Columns are also selected when one of their cells contains _product itself, in a row
  # that carries price/quantity/spec keywords.
  689. for line in list_table:
  690. line_text = ",".join([cell[0] for cell in line])
  691. for cell_i in range(len(line)):
  692. cell = line[cell_i]
  693. cell_text = cell[0]
  694. if cell_text is not None and _product is not None and len(cell_text)<len(_product)*10 and cell_text.find(_product)>=0 and re.search("单价|数量|总价|规格|品牌|型号|用途|要求|采购量",line_text) is not None:
  695. list_head_index.append(cell_i)
  696. list_head_index = list(set(list_head_index))
  697. if len(list_head_index)>0:
  698. has_number = False
  699. for cell_i in list_head_index:
  700. table_products = []
  701. for line_i in range(_begin_index,len(list_table)):
  702. line = list_table[line_i]
  703. for _i in range(len(line)):
  704. cell = line[_i]
  705. cell_text = cell[0]
  # has_number is set once any cell made only of digits has been seen; it is never reset.
  706. if re.search("^\d+$",cell_text) is not None:
  707. has_number = True
  708. if cell_i>=len(line):
  709. continue
  710. cell = line[cell_i]
  711. cell_text = cell[0]
  712. if re.search(_param_pattern,cell_text) is None or has_number:
  # Purely alphanumeric cells are not kept as product names.
  713. if re.search("^[\da-zA-Z]+$",cell_text) is None:
  714. table_products.append(cell_text)
  715. if len(table_products)>0:
  716. logger.debug("table products %s"%(str(table_products)))
  # A column is accepted only if every entry is non-empty and at most 30 characters long,
  # and the joined text contains none of the listed non-product keywords.
  717. if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=30:
  718. if re.search("招标人|代理人|预算|数量|交货期|品牌|产地","".join(table_products)) is None:
  719. list_table_products.append(table_products)
  # Prefer a column containing a name that is_similar() to _product (threshold 90);
  # is_similar is defined outside this chunk.
  720. _find = False
  721. for table_products in list_table_products:
  722. for _p in table_products:
  723. if is_similar(_product,_p,90):
  724. _find = True
  725. logger.debug("similar table_products %s"%(str(table_products)))
  726. list_result = list(set([a for a in table_products if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费|^其他$",a) is None]))
  727. break
  # Without such a column, the entries of every accepted column are pooled and filtered.
  728. if not _find:
  729. for table_products in list_table_products:
  730. list_result.extend(table_products)
  731. list_result = list(set([a for a in list_result if len(a)>1 and len(a)<30 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
  732. return list_result
  733. def get_childs(childs):
  734. list_data = []
  735. for _child in childs:
  736. list_data.append(_child)
  737. childs2 = _child.get("child_title",[])
  738. if len(childs2)>0:
  739. for _child2 in childs2:
  740. list_data.extend(get_childs([_child2]))
  741. return list_data
  742. def get_range_data_by_childs(list_data,childs):
  743. range_data = []
  744. list_child = get_childs(childs)
  745. list_index = []
  746. set_child = set([id(x) for x in list_child])
  747. for _data_i in range(len(list_data)):
  748. _data = list_data[_data_i]
  749. _id = id(_data)
  750. if _id in set_child:
  751. list_index.append(_data_i)
  752. if len(list_index)>0:
  753. range_data = list_data[min(list_index):max(list_index)+1]
  754. return range_data
  755. def get_correct_product(product,products):
  756. list_data = []
  757. for p in products:
  758. is_sim = is_similar(product,p)
  759. _d = {"product":p,"distance":abs(len(product)-len(p)),"is_sim":is_sim}
  760. list_data.append(_d)
  761. list_data.sort(key=lambda x:x["distance"])
  762. for _d in list_data:
  763. is_sim = _d["is_sim"]
  764. if is_sim:
  765. if len(_d["product"])>len(product) and _d["product"].find(product)>=0:
  766. return product
  767. return _d["product"]
  768. return product
  # get_childs_text: walk title nodes (and, recursively, their "child_title" lists) and
  # concatenate node texts, each followed by "\r\n", while the is_begin flag is set.
  #   childs    -- list of node dicts, read through "text", "sentence_title", "title_next",
  #                "has_product" and "child_title".
  #   _product  -- product name whose occurrence in a node text switches is_begin on.
  #   products  -- other known product names, used for both the begin and the end test.
  #   is_begin / is_end -- incoming state of the two flags.
  # Returns the tuple (_text, is_begin, is_end).
  # logger, is_similar and end_pattern are names defined outside this chunk.
  # NOTE(review): this copy of the file has lost its indentation, so which statements sit
  # inside which loop/branch (in particular the `break`s and the final `if end_next`) cannot
  # be read off the text; restore the nesting from the upstream source before editing.
  769. def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
  770. _text = ""
  771. end_next = False
  772. for _child in childs:
  773. child_text = _child.get("text")
  774. if child_text.find(_product)>=0:
  775. if not is_begin:
  776. is_begin = True
  777. if not end_next:
  # end_next is armed when this node has a sentence title and its "title_next" entry is a
  # dict that has one as well.
  778. if _child["sentence_title"] is not None and isinstance(_child["title_next"],dict) and _child["title_next"]["sentence_title"] is not None:
  779. end_next = True
  780. end_title = _child["title_next"]
  781. logger.debug("end_title %s "%end_title["text"])
  782. logger.debug("%s-%s-%s"%("get_childs_text",child_text[:10],str(is_begin)))
  783. for p in products:
  # A node that contains a product name similar to _product (threshold 90) also starts
  # the collection.
  784. if child_text.find(p)>=0 and is_similar(_product,p,90):
  785. is_begin = True
  # End test: the node does not contain _product, p is not similar to _product
  # (threshold 80), and the node either contains p or has its "has_product" flag set.
  786. if child_text.find(_product)<0 and not is_similar(_product,p,80) and (child_text.find(p)>=0 or _child["has_product"]):
  787. if is_begin:
  788. is_end = True
  789. logger.debug("%s-%s-%s"%("get_childs_text end",child_text[:10],p))
  790. break
  # A match of end_pattern in the node text ends the collection once it has begun.
  791. if re.search(end_pattern,child_text) is not None:
  792. if is_begin:
  793. is_end = True
  794. logger.debug("%s-%s-%s"%("get_childs_text end",child_text[:10],str(is_end)))
  795. if is_begin and is_end:
  796. break
  797. if is_begin:
  798. _text += _child.get("text")+"\r\n"
  799. childs2 = _child.get("child_title",[])
  800. if len(childs2)>0:
  801. for _child2 in childs2:
  # The recursive call forwards is_begin only; is_end is not passed, so it starts from its
  # default (False) for each sub-title.
  802. child_text,is_begin,is_end = get_childs_text([_child2],_product,products,is_begin)
  803. if is_begin:
  804. _text += child_text
  805. if is_end:
  806. break
  807. if end_next:
  808. is_end = True
  809. # logger.debug("%s-%s-%s"%("get_childs_text1",_text,str(is_begin)))
  810. # logger.debug("%s-%s-%s"%("get_childs_text2",_text,str(is_begin)))
  811. return _text,is_begin,is_end
  # extract_parameters_by_tree: gather parameter text for _product from the title tree and
  # append every non-empty piece to list_result (which is mutated in place).
  #   _product, products -- target product name and the other known product names.
  #   list_data, _data_i -- the flat node list and the index of the node being examined.
  #   parent_title       -- an ancestor title node of that node, or None.
  #   list_result        -- output list of candidate texts.
  # Three sources are tried: the node's own subtree, the subtree of parent_title, and the
  # run of list_data (from _data_i on) covered by parent_title's children.
  # The visible return statements are `return True` right after the range text is appended
  # and `return False` as the last line.
  # NOTE(review): this copy of the file has lost its indentation, so the nesting of the
  # statements below cannot be read off the text; restore it from the upstream source.
  812. def extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result,):
  813. _data = list_data[_data_i]
  814. childs = _data.get("child_title",[])
  815. if len(childs)>0:
  816. child_text,_,_ = get_childs_text([_data],_product,products)
  817. if len(child_text)>0:
  818. logger.info("extract_type by_tree child_text:%s"%child_text)
  819. list_result.append(child_text)
  820. if parent_title is not None:
  821. child_text,_,_ = get_childs_text([parent_title],_product,products)
  822. if len(child_text)>0:
  823. logger.info("extract_type by_tree child_text:%s"%child_text)
  824. list_result.append(child_text)
  825. childs = parent_title.get("child_title",[])
  826. if len(childs)>0:
  827. range_data = get_range_data_by_childs(list_data[_data_i:],childs)
  828. p_text = ""
  829. _find = False
  # NOTE(review): get_childs_text tests `sentence_title is not None` and
  # `isinstance(title_next, dict)`; here the isinstance check is applied to
  # "sentence_title" and the None check to "title_next" -- possibly swapped, confirm.
  830. end_id = id(_data["title_next"]) if isinstance(_data["sentence_title"],dict) and _data["title_next"] is not None and _data["title_next"]["sentence_title"] is not None else None
  831. for pdata in range_data:
  832. ptext = pdata["text"]
  833. for p in products:
  # Stop condition: the entry does not contain _product and either contains another
  # product name or has its "has_product" flag set.
  834. if ptext.find(_product)<0 and (ptext.find(p)>=0 or pdata["has_product"]):
  835. _find = True
  836. break
  # end_pattern is defined outside this chunk; a match also stops the collection.
  837. if re.search(end_pattern,ptext) is not None:
  838. _find = True
  839. if _find:
  840. break
  # Reaching the node recorded in end_id stops the collection as well.
  841. if id(pdata)==end_id:
  842. break
  843. p_text += ptext+"\r\n"
  844. if len(p_text)>0:
  845. logger.debug("extract_type by parent range_text:%s"%p_text)
  846. list_result.append(p_text)
  847. return True
  848. return False
  # get_table_pieces: cut the rows that belong to _product out of an HTML table and append
  # them to list_result as one "<table>...</table>" string.
  #   _text       -- HTML text; it is parsed with BeautifulSoup(..., "lxml") and only the
  #                  first <table> found is used.
  #   _product    -- product name; a row whose text contains it switches _find on.
  #   products    -- other product names; a row containing one of them switches _find off.
  #   list_result -- output list, mutated in place.
  #   _find       -- initial value of the "currently collecting rows" flag.
  # No return statement is visible in this function.
  # BeautifulSoup, getTrs, logger and end_pattern are names defined outside this chunk.
  # NOTE(review): this copy of the file has lost its indentation, so it cannot be told from
  # the text which loop the `break` after the end_pattern test leaves; restore the nesting
  # from the upstream source before editing.
  849. def get_table_pieces(_text,_product,products,list_result,_find):
  850. _soup = BeautifulSoup(_text,"lxml")
  851. _table = _soup.find("table")
  852. if _table is not None:
  853. trs = getTrs(_table)
  854. list_trs = []
  855. for tr in trs:
  856. tr_text = tr.get_text()
  857. if tr_text.find(_product)>=0:
  858. _find = True
  859. logger.debug("%s-%s"%("table_html_tr",tr_text))
  860. for p in products:
  861. if _find and p!=_product and tr_text.find(p)>=0:
  862. _find = False
  863. break
  864. if re.search(end_pattern,tr_text) is not None:
  865. _find = False
  866. break
  867. if _find:
  868. list_trs.append(tr)
  869. if len(list_trs)>0:
  # The kept rows are re-serialised with str() and wrapped in a fresh <table> element.
  870. table_html = "<table>%s</table>"%("\r\n".join([str(a) for a in list_trs]))
  871. logger.debug("extract_type table slices %s"%(table_html))
  872. list_result.append(table_html)
  # extract_parameters_by_table: look for parameter text of _product inside one table entry
  # of list_data and append what is found to list_result (mutated in place).
  #   _product, products -- target product name and the other known product names.
  #   _param_pattern     -- regex recognising parameter/product header text.
  #   list_data, _data_i -- the flat entry list and the index of the table entry; the entry
  #                         is read through "text", "list_table" and "parent_title".
  #   list_result        -- output list of candidate texts.
  # logger, meter_pattern and get_table_pieces' helpers are defined outside this chunk.
  # NOTE(review): this copy of the file has lost its indentation, so the nesting of the
  # statements below (including which `if` the `else:` belongs to) cannot be read off the
  # text; restore it from the upstream source before editing.
  873. def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result):
  874. _data = list_data[_data_i]
  875. _text = _data["text"]
  876. list_table = _data["list_table"]
  877. parent_title = _data["parent_title"]
  878. if list_table is not None:
  879. _check = True
  880. max_length = max([len(a) for a in list_table])
  881. min_length = min([len(a) for a in list_table])
  882. text_line_first = ",".join(a[0] for a in list_table[0])
  883. if max_length>10:
  884. if min_length<max_length/2:
  885. return
  # NOTE(review): for _data_i == 0 the index -1 selects the last entry of list_data rather
  # than "no previous entry" -- confirm this is intended or unreachable.
  886. last_data = list_data[_data_i-1]
  887. _flag = False
  # _flag is raised when the preceding sentence mentions _product ...
  888. if last_data["type"]=="sentence" and last_data["text"].find(_product)>=0:
  889. logger.debug("last sentence find product %s-%s"%(_product,last_data["text"]))
  890. _flag = True
  891. # print(text_line_first,"text_line_first",re.search(_param_pattern,text_line_first) is not None and text_line_first.find(_product)>=0)
  # ... or when the first table row matches _param_pattern and contains _product.
  892. if re.search(_param_pattern,text_line_first) is not None and text_line_first.find(_product)>=0:
  893. _flag = True
  894. if _flag:
  895. if len(products)==0:
  896. logger.debug("extract_type whole table by param and product %s"%(_text))
  897. list_result.append(_text)
  898. else:
  899. for p in products:
  # The presence of any other product name in the table text clears _flag again.
  900. if p!=_product and _text.find(p)>=0:
  901. logger.debug("extract_type add all table failed %s-%s"%(_product,p))
  902. _flag = False
  903. break
  904. if _flag:
  905. logger.debug("extract_type add all table succeed")
  906. get_table_pieces(_text,_product,products,list_result,True)
  907. else:
  908. list_head_index = []
  # Header columns: cells in the first two rows shorter than 20 characters that match
  # _param_pattern.
  909. for line in list_table[:2]:
  910. for cell_i in range(len(line)):
  911. cell = line[cell_i]
  912. cell_text = cell[0]
  913. if len(cell_text)<20 and re.search(_param_pattern,cell_text) is not None:
  914. list_head_index.append(cell_i)
  915. list_head_index = list(set(list_head_index))
  916. for line in list_table:
  917. for cell in line:
  918. cell_text = cell[0]
  # A long cell (>50 chars) with more than five meter_pattern matches that mentions
  # _product is treated as a parameter cell candidate.
  919. if len(cell_text)>50 and len(re.findall(meter_pattern,cell_text))>5 and cell_text.find(_product)>=0:
  920. _f = True
  # This inner loop reuses the names `cell` and `cell_text` of the enclosing iteration.
  921. for cell in line:
  922. if not _f:
  923. break
  924. cell_text = cell[0]
  925. for p in products:
  926. if cell_text.find(p)>=0 and p!=_product:
  927. _f = False
  928. break
  929. if _f:
  930. logger.debug("extract_type param column %s"%(cell_text))
  931. list_result.append(cell_text)
  932. if len(cell_text)<len(_product)*10 and str(cell_text).find(_product)>=0:
  933. for _index in list_head_index:
  934. if _index>=len(line):
  935. continue
  936. _cell = line[_index]
  # NOTE(review): the length test reads `cell[0]` while the value logged and appended is
  # `_cell[0]` -- possibly `_cell[0]` was intended here, confirm.
  937. if len(cell[0])>0:
  938. logger.info("%s-%s"%("extract_type add on table text:",_cell[0]))
  939. list_result.append(_cell[0])
  # Fallback: row-wise slicing when _flag is not set, the table text (or the parent title
  # text) matches _param_pattern, and the table text contains _product.
  940. if not _flag and (re.search(_param_pattern,_text) is not None or (parent_title is not None and re.search(_param_pattern,parent_title["text"]) is not None)) and _text.find(_product)>=0:
  941. get_table_pieces(_text,_product,products,list_result,False)
  # extract_parameters_by_sentence: look for parameter text of _product starting from one
  # sentence entry and append candidates to list_result (mutated in place).
  #   list_data, _data, _data_i -- the flat entry list, the sentence entry and its index.
  #   _product, products        -- target product name and the other known product names.
  #   list_result               -- output list of candidate texts.
  #   is_project                -- flag used by the last branch below, where collection is
  #                                started unconditionally (is_begin=True).
  # The first statement after reading the text tests that the sentence contains _product.
  # _param_pattern and meter_pattern are not parameters of this function; they are looked up
  # as names defined outside this chunk, as is logger.
  # NOTE(review): this copy of the file has lost its indentation, so the nesting of the
  # statements below (including which `if` each `else:`/`elif` belongs to) cannot be read
  # off the text; restore it from the upstream source before editing.
  942. def extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project):
  943. _text = _data["text"]
  944. if _text.find(_product)>=0:
  945. parent_title = _data.get("parent_title")
  946. parent_text = ""
  947. parent_parent_title = None
  948. parent_parent_text = ""
  949. parent_title_index = None
  950. parent_parent_title_index = None
  951. childs = get_childs([_data])
  952. child_find = False
  # child_find: some node in this entry's subtree is short (<30 chars) and matches
  # _param_pattern.
  953. for c in childs:
  954. if re.search(_param_pattern,c["text"]) is not None and len(c["text"])<30:
  955. logger.debug("child text %s"%(c["text"]))
  956. child_find = True
  957. break
  958. extract_text,_,_ = get_childs_text([_data],_product,products)
  959. logger.debug("childs found extract_text %s %s"%(str(child_find),extract_text))
  960. if child_find:
  961. if len(extract_text)>0:
  962. list_result.append(extract_text)
  963. else:
  # limit_nums is a length budget for the sentence derived from the product name length,
  # enlarged for very short names and when the sentence mentions "数量".
  964. limit_nums = len(_product)*2+5
  965. if len(_product)<=3:
  966. limit_nums += 6
  967. if _text.find("数量")>=0:
  968. limit_nums += 6
  969. if len(_text)<=limit_nums and _data["sentence_title"] is not None:
  970. if re.search(meter_pattern,extract_text) is not None:
  971. list_result.append(extract_text)
  972. elif len(re.findall(meter_pattern,extract_text))>2:
  973. list_result.append(extract_text)
  974. if parent_title is not None:
  975. parent_text = parent_title.get("text","")
  976. parent_parent_title = parent_title.get("parent_title")
  977. parent_title_index = parent_title["title_index"]
  978. if parent_parent_title is not None:
  979. parent_parent_text = parent_parent_title.get("text","")
  980. parent_parent_title_index = parent_parent_title["title_index"]
  # _suit: the sentence, its parent title or its grandparent title is short (<50 chars)
  # and matches _param_pattern.
  981. _suit = False
  982. if re.search(_param_pattern,_text) is not None and len(_text)<50:
  983. _suit = True
  984. if re.search(_param_pattern,parent_text) is not None and len(parent_text)<50:
  985. _suit = True
  986. if re.search(_param_pattern,parent_parent_text) is not None and len(parent_parent_text)<50:
  987. _suit = True
  988. if _suit:
  989. logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
  # The tree extraction is tried with parent_title first; when it reports False it is
  # repeated with parent_parent_title.  Both attempts log the same message.
  990. if not extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result):
  991. logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
  992. extract_parameters_by_tree(_product,products,list_data,_data_i,parent_parent_title,list_result)
  993. if re.search(_param_pattern,_text) is not None and len(_text)<50:
  994. childs = _data["child_title"]
  995. if len(childs)>0:
  996. extract_text,_,_ = get_childs_text([_data],_product,products)
  997. if len(extract_text)>0:
  998. logger.debug("extract_type param-product %s"%(extract_text))
  999. list_result.append(extract_text)
  1000. elif is_project:
  1001. extract_text,_,_ = get_childs_text([_data],_product,products,is_begin=True)
  1002. if len(extract_text)>0 and re.search(meter_pattern,extract_text) is not None:
  1003. logger.debug("extract_type sentence is_project param-product is product %s"%(extract_text))
  1004. list_result.append(extract_text)
  # getBestProductText: choose one text out of the candidates in list_result.
  #   list_result -- candidate strings (plain text or HTML); sorted IN PLACE, descending, by
  #                  the number of matches of meter_pattern / colon-semicolon characters /
  #                  digit-plus-unit tokens in their BeautifulSoup-extracted text.
  #   _product    -- target product name.
  #   products    -- other known product names.
  # Candidates are checked in that order and the first one whose _check flag is still True
  # is returned.  No other return statement is visible, so the result is None when no
  # candidate passes (or list_result is empty).
  # BeautifulSoup, logger, is_similar, meter_pattern and not_meter_pattern are names defined
  # outside this chunk.
  # NOTE(review): this copy of the file has lost its indentation, so the nesting of the
  # checks below cannot be read off the text; restore it from the upstream source.
  1005. def getBestProductText(list_result,_product,products):
  1006. list_result.sort(key=lambda x:len(re.findall(meter_pattern+"|"+'[::;;]|\d+[%A-Za-z]+',BeautifulSoup(x,"lxml").get_text())), reverse=True)
  1007. logger.debug("+++++++++++++++++++++")
  1008. for i in range(len(list_result)):
  1009. logger.debug("result%d %s"%(i,list_result[i]))
  1010. logger.debug("+++++++++++++++++++++")
  1011. for i in range(len(list_result)):
  1012. _result = list_result[i]
  1013. _check = True
  1014. _result_text = BeautifulSoup(_result,"lxml").get_text()
  # Reject candidates that contain project-header or bank/consortium wording.
  1015. _search = re.search("项目编号[::]|项目名称[::]|联合体投标|开户银行",_result)
  1016. if _search is not None:
  1017. logger.debug("result%d error illegal text %s"%(i,str(_search)))
  1018. _check = False
  # The cross-product test is skipped for short (<1000 chars) candidates that do not start
  # with "<table".
  1019. if not (len(_result_text)<1000 and _result[:6]!="<table"):
  1020. for p in products:
  # NOTE(review): `find(p)>0` ignores a product name found at position 0, unlike the
  # `>=0` tests used in the other functions of this file -- confirm.
  1021. if _result_text.find(p)>0 and not (is_similar(_product,p,80) or p.find(_product)>=0 or _product.find(p)>=0):
  1022. logger.debug("result%d error product scoss %s"%(i,p))
  1023. _check = False
  # Very short texts must contain at least one meter_pattern match.
  1024. if len(_result_text)<100:
  1025. if re.search(meter_pattern,_result_text) is None:
  1026. logger.debug("result%d error text min count"%(i))
  1027. _check = False
  # Texts over 10000 chars are rejected; texts over 5000 chars need at least 10 matches.
  1028. if len(_result_text)>5000:
  1029. if len(_result_text)>10000:
  1030. logger.debug("result%d error text max count"%(i))
  1031. _check = False
  1032. elif len(re.findall(meter_pattern,_result_text))<10:
  1033. logger.debug("result%d error text max count less meter"%(i))
  1034. _check = False
  # _count: distinct meter_pattern matches minus distinct not_meter_pattern matches.
  1035. list_find = list(set(re.findall(meter_pattern,_result_text)))
  1036. not_list_find = list(set(re.findall(not_meter_pattern,_result_text)))
  1037. _count = len(list_find)-len(not_list_find)
  1038. has_num = False
  1039. for _find in list_find:
  1040. if re.search('[0-9a-zA-Z]',_find) is not None:
  1041. has_num = True
  1042. break
  # Accept only if (_count >= 2 and a match contains an ASCII letter/digit) or _count >= 5.
  1043. if not(_count>=2 and has_num or _count>=5):
  1044. logger.debug("result%d error match not enough"%(i))
  1045. _check = False
  1046. if _check:
  1047. return _result
  1048. def format_text(_result):
  1049. list_result = re.split("\r|\n",_result)
  1050. _result = ""
  1051. for _r in list_result:
  1052. if len(_r)>0:
  1053. _result+="%s\n"%(_r)
  1054. _result = '<div style="white-space:pre">%s</div>'%(_result)
  1055. return _result
  # extract_product_parameters: entry point that extracts the parameter text of one product
  # from a parsed document.
  #   list_data -- flat list of entry dicts, read here through "type" and "text".
  #   _product  -- product name as supplied by the caller; it is stripped, passed through
  #                standard_product() and then through get_correct_product().
  # Returns the tuple (_text, _find_count): _text is whatever getBestProductText() returns,
  # _find_count counts the sentence/table entries whose text contains the product name.
  # logger, is_similar and _param_pattern (used as a free name in the table call) are
  # defined outside this chunk.
  # NOTE(review): this copy of the file has lost its indentation, so it cannot be told from
  # the text whether the extract_parameters_by_* calls and the is_project update depend on
  # the `_text.find(_product)>=0` tests; restore the nesting from the upstream source.
  1056. def extract_product_parameters(list_data,_product):
  1057. list_result = []
  1058. _product = standard_product(_product.strip())
  1059. products = extract_products(list_data,_product)
  1060. _product = get_correct_product(_product,products)
  1061. logger.debug("all products %s-%s"%(_product,str(products)))
  # is_project is raised when the product name itself looks like a project title, or when
  # the only product found in the tables is similar to it (threshold 90).
  1062. is_project = False
  1063. if re.search("项目名称|采购项目",_product) is not None:
  1064. is_project = True
  1065. if len(products)==1 and is_similar(products[0],_product,90):
  1066. is_project = True
  1067. _find_count = 0
  1068. for _data_i in range(len(list_data)):
  1069. _data = list_data[_data_i]
  1070. _type = _data["type"]
  1071. _text = _data["text"]
  1072. if _type=="sentence":
  1073. if _text.find(_product)>=0:
  1074. _find_count += 1
  1075. if re.search("项目名称|采购项目",_text) is not None and re.search("等",_text) is not None:
  1076. is_project = True
  1077. extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project)
  1078. elif _type=="table":
  1079. if _text.find(_product)>=0:
  1080. _find_count += 1
  1081. extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result)
  1082. _text = getBestProductText(list_result,_product,products)
  1083. return _text,_find_count
  1084. if __name__ == '__main__':
  1085. filepath = "download/4597dcc128bfabc7584d10590ae50656.html"
  1086. _product = "彩色多普勒超声诊断仪"
  1087. _html = open(filepath, "r", encoding="utf8").read()
  1088. pd = ParseDocument(_html,False)
  1089. pd.fix_tree(_product)
  1090. list_data = pd.tree
  1091. pd.print_tree(list_data)
  1092. _text,_count = extract_product_parameters(list_data,_product)
  1093. logger.info("find count:%d"%(_count))
  1094. logger.info("extract_parameter_text::%s"%(_text))