|
@@ -6,6 +6,7 @@ import logging
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
|
|
|
|
|
|
|
|
|
+
|
|
from bs4 import BeautifulSoup
|
|
from bs4 import BeautifulSoup
|
|
import copy
|
|
import copy
|
|
|
|
|
|
@@ -144,7 +145,7 @@ class ParseDocument():
|
|
if _id in self.set_tree_id:
|
|
if _id in self.set_tree_id:
|
|
continue
|
|
continue
|
|
self.set_tree_id.add(_id)
|
|
self.set_tree_id.add(_id)
|
|
- print(append,t["text"][:20])
|
|
|
|
|
|
+ print(append,t["text"][:50],t["sentence_title"])
|
|
childs = t["child_title"]
|
|
childs = t["child_title"]
|
|
self.print_tree(childs,append=append+" ")
|
|
self.print_tree(childs,append=append+" ")
|
|
|
|
|
|
@@ -154,15 +155,16 @@ class ParseDocument():
|
|
return False
|
|
return False
|
|
|
|
|
|
def find_title_by_pattern(self,_text,_pattern="(^|★|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册部\.::]))|" \
|
|
def find_title_by_pattern(self,_text,_pattern="(^|★|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册部\.::]))|" \
|
|
- "([\s★\*]*)(?P<title_3>(?P<title_3_index_0_0>.{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
|
|
|
|
- "([\s★\*]*)(?P<title_4>(?P<title_4_index_0_0>.{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
|
|
|
|
- "([\s★\*]*)(?P<title_11>(?P<title_11_index_0_0>.{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
|
|
|
|
- "([\s★\*]*)(?P<title_10>(?P<title_10_index_0_0>.{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
|
|
|
|
- "([\s★\*]*)(?P<title_7>(?P<title_7_index_0_0>.{,3}?\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]?))|" \
|
|
|
|
- "([\s★\*]*)(?P<title_6>(?P<title_6_index_0_0>.{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-]?))|" \
|
|
|
|
- "([\s★\*]*)(?P<title_15>(?P<title_15_index_0_0>.{,3}?(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
|
|
|
|
- "([\s★\*]*)(?P<title_17>(?P<title_17_index_0_0>.{,3}?(?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>)))|" \
|
|
|
|
- "([\s★\*]*)(?P<title_19>(?P<title_19_index_0_0>.{,3}?(?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))" \
|
|
|
|
|
|
+ "([\s★\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
|
|
|
|
+ "([\s★\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
|
|
|
|
+ "([\s★\*]*)(?P<title_12>(?P<title_12_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_12_index_1_1>\d{1,2})(?P<title_12_index_2_0>[\..、\s\-]?))|"\
|
|
|
|
+ "([\s★\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
|
|
|
|
+ "([\s★\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
|
|
|
|
+ "([\s★\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]?))|" \
|
|
|
|
+ "([\s★\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-]?))|" \
|
|
|
|
+ "([\s★\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))]))|" \
|
|
|
|
+ "([\s★\*]*)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>[))]))|" \
|
|
|
|
+ "([\s★\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))" \
|
|
):
|
|
):
|
|
_se = re.search(_pattern,_text)
|
|
_se = re.search(_pattern,_text)
|
|
groups = []
|
|
groups = []
|
|
@@ -172,7 +174,7 @@ class ParseDocument():
|
|
if v is not None:
|
|
if v is not None:
|
|
groups.append((k,v))
|
|
groups.append((k,v))
|
|
if len(groups):
|
|
if len(groups):
|
|
- groups.sort(key=lambda x:x[0])
|
|
|
|
|
|
+ # groups.sort(key=lambda x:x[0])
|
|
return groups
|
|
return groups
|
|
return None
|
|
return None
|
|
|
|
|
|
@@ -190,10 +192,30 @@ class ParseDocument():
|
|
_add = 0
|
|
_add = 0
|
|
return next_chr+self.make_increase(_sort,_title[:-1],_add)
|
|
return next_chr+self.make_increase(_sort,_title[:-1],_add)
|
|
|
|
|
|
|
|
+
|
|
def get_next_title(self,_title):
|
|
def get_next_title(self,_title):
|
|
if re.search("^\d+$",_title) is not None:
|
|
if re.search("^\d+$",_title) is not None:
|
|
return str(int(_title)+1)
|
|
return str(int(_title)+1)
|
|
if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
|
|
if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
|
|
|
|
+ if _title[-1]=="十":
|
|
|
|
+ return _title+"一"
|
|
|
|
+ if _title[-1]=="百":
|
|
|
|
+ return _title+"零一"
|
|
|
|
+
|
|
|
|
+ if _title[-1]=="九":
|
|
|
|
+ if len(_title)==1:
|
|
|
|
+ return "十"
|
|
|
|
+ if len(_title)==2:
|
|
|
|
+ if _title[0]=="十":
|
|
|
|
+ return "二十"
|
|
|
|
+ if len(_title)==3:
|
|
|
|
+ if _title[0]=="九":
|
|
|
|
+ return "一百"
|
|
|
|
+ else:
|
|
|
|
+ _next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title[0]))
|
|
|
|
+ print("=_next_title",_next_title)
|
|
|
|
+ return _next_title+"十"
|
|
|
|
+
|
|
_next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
|
|
_next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
|
|
_next_title = list(_next_title)
|
|
_next_title = list(_next_title)
|
|
_next_title.reverse()
|
|
_next_title.reverse()
|
|
@@ -225,6 +247,34 @@ class ParseDocument():
|
|
return _sort[_index+1]
|
|
return _sort[_index+1]
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
def count_title_before(self, list_obj):
    """Count how often each normalized title prefix occurs in ``list_obj``.

    Every non-table element has the first 10 characters of its ``.text``
    run through ``self.find_title_by_pattern``. When a title is found, the
    value of the second returned group (named ``title_before`` by the
    callers) is normalized and tallied.

    :param list_obj: sequence of objects exposing ``.name`` and ``.text``
        (presumably BeautifulSoup tags -- confirm against the caller).
    :return: dict mapping normalized ``title_before`` -> occurrence count.
    :raises IndexError: if ``find_title_by_pattern`` returns fewer than two
        groups (unchanged from the previous behaviour).
    """
    dict_before = {}
    for obj in list_obj:
        # Tables are never treated as sentence titles.
        if obj.name == "table":
            continue

        sentence_groups = self.find_title_by_pattern(obj.text[:10])
        if not sentence_groups:
            continue

        # Same replace chain as the one buildParsetree applies before it
        # looks a title up in this dict; the two must stay in sync or the
        # lookup will miss.
        # NOTE(review): several replace pairs look identical in this view
        # (e.g. "(" -> "("); presumably full-width -> ASCII conversions whose
        # characters were lost in extraction -- confirm against the real file.
        title_before = sentence_groups[1][1].replace("(","(").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")

        # Only the prefix is needed here; the title index, suffix and
        # "next title" are computed by buildParsetree when it needs them.
        dict_before[title_before] = dict_before.get(title_before, 0) + 1
    return dict_before
|
|
|
|
+
|
|
|
|
+
|
|
def buildParsetree(self,list_obj,auto_merge_table=True):
|
|
def buildParsetree(self,list_obj,auto_merge_table=True):
|
|
|
|
|
|
self.parseTree = None
|
|
self.parseTree = None
|
|
@@ -245,6 +295,7 @@ class ParseDocument():
|
|
last_table_index = None
|
|
last_table_index = None
|
|
last_table_columns = None
|
|
last_table_columns = None
|
|
last_table = None
|
|
last_table = None
|
|
|
|
+ dict_before = self.count_title_before(list_obj)
|
|
for obj_i in range(len(list_obj)):
|
|
for obj_i in range(len(list_obj)):
|
|
obj = list_obj[obj_i]
|
|
obj = list_obj[obj_i]
|
|
_type = "sentence"
|
|
_type = "sentence"
|
|
@@ -272,12 +323,16 @@ class ParseDocument():
|
|
sentence_groups = self.find_title_by_pattern(_text[:10])
|
|
sentence_groups = self.find_title_by_pattern(_text[:10])
|
|
if sentence_groups:
|
|
if sentence_groups:
|
|
# c062f53cf83401e671822003d63c1828print("sentence_groups",sentence_groups)
|
|
# c062f53cf83401e671822003d63c1828print("sentence_groups",sentence_groups)
|
|
- sentence_title = sentence_groups[0][0]
|
|
|
|
- sentence_title_text = sentence_groups[0][1]
|
|
|
|
- title_index = sentence_groups[-2][1]
|
|
|
|
- title_before = sentence_groups[1][1]
|
|
|
|
- title_after = sentence_groups[-1][1]
|
|
|
|
- next_index = self.get_next_title(title_index)
|
|
|
|
|
|
+ title_before = sentence_groups[1][1].replace("(","(").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")
|
|
|
|
+ if title_before in dict_before and dict_before[title_before]>1:
|
|
|
|
+ sentence_title = sentence_groups[0][0]
|
|
|
|
+ sentence_title_text = sentence_groups[0][1]
|
|
|
|
+ title_index = sentence_groups[-2][1]
|
|
|
|
+
|
|
|
|
+ title_after = sentence_groups[-1][1].replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")
|
|
|
|
+ next_index = self.get_next_title(title_index)
|
|
|
|
+ else:
|
|
|
|
+ title_before = None
|
|
|
|
|
|
if _type=="sentence":
|
|
if _type=="sentence":
|
|
if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
|
|
if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
|
|
@@ -298,7 +353,7 @@ class ParseDocument():
|
|
table_columns = len(list_table[0])
|
|
table_columns = len(list_table[0])
|
|
|
|
|
|
if auto_merge_table:
|
|
if auto_merge_table:
|
|
- if last_table_index is not None and abs(obj_i-last_table_index)<=1 and last_table_columns is not None and last_table_columns==table_columns:
|
|
|
|
|
|
+ if last_table_index is not None and abs(obj_i-last_table_index)<=2 and last_table_columns is not None and last_table_columns==table_columns:
|
|
if last_table is not None:
|
|
if last_table is not None:
|
|
trs = getTrs(_table)
|
|
trs = getTrs(_table)
|
|
last_tbody = BeautifulSoup(last_table["text"],"lxml")
|
|
last_tbody = BeautifulSoup(last_table["text"],"lxml")
|
|
@@ -340,14 +395,9 @@ class ParseDocument():
|
|
else:
|
|
else:
|
|
_find = False
|
|
_find = False
|
|
for i in range(1,len(list_data)+1):
|
|
for i in range(1,len(list_data)+1):
|
|
- _d = list_data[-i]
|
|
|
|
- if i==1 and _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
|
|
|
|
- _data["parent_title"] = _d["parent_title"]
|
|
|
|
- _d["title_next"] = _data
|
|
|
|
- if _d["parent_title"] is not None:
|
|
|
|
- _d["parent_title"]["child_title"].append(_data)
|
|
|
|
- _find = True
|
|
|
|
|
|
+ if _find:
|
|
break
|
|
break
|
|
|
|
+ _d = list_data[-i]
|
|
if _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
|
|
if _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
|
|
if _d["next_index"]==title_index and _d["title_next"] is None:
|
|
if _d["next_index"]==title_index and _d["title_next"] is None:
|
|
_data["parent_title"] = _d["parent_title"]
|
|
_data["parent_title"] = _d["parent_title"]
|
|
@@ -356,6 +406,18 @@ class ParseDocument():
|
|
_d["parent_title"]["child_title"].append(_data)
|
|
_d["parent_title"]["child_title"].append(_data)
|
|
_find = True
|
|
_find = True
|
|
break
|
|
break
|
|
|
|
+ for i in range(1,len(list_data)+1):
|
|
|
|
+ if _find:
|
|
|
|
+ break
|
|
|
|
+ _d = list_data[-i]
|
|
|
|
+ if i==1 and _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
|
|
|
|
+ _data["parent_title"] = _d["parent_title"]
|
|
|
|
+ _d["title_next"] = _data
|
|
|
|
+ if _d["parent_title"] is not None:
|
|
|
|
+ _d["parent_title"]["child_title"].append(_data)
|
|
|
|
+ _find = True
|
|
|
|
+ break
|
|
|
|
+
|
|
if not _find:
|
|
if not _find:
|
|
if len(list_data)>0:
|
|
if len(list_data)>0:
|
|
for i in range(1,len(list_data)+1):
|
|
for i in range(1,len(list_data)+1):
|
|
@@ -594,8 +656,9 @@ def extract_parameters_by_table(_product,_param_pattern,list_data,_data_i,list_r
|
|
list_result.append(_cell[0])
|
|
list_result.append(_cell[0])
|
|
|
|
|
|
def extract_product_parameters(list_data,_product):
|
|
def extract_product_parameters(list_data,_product):
|
|
- _param_pattern = "配置要求|技术要求|技术参数|具体参数|规格参数|参数要求|技术需求|配置清单|(质量|技术).{,10}要求|明细及参数|验收标准|^参数$"
|
|
|
|
|
|
+ _param_pattern = "产品配置|配置要求|技术要求|技术参数|参数指标|具体参数|规格参数|参数要求|技术需求|配置清单|(质量|技术).{,10}要求|明细及参数|验收标准|^参数$"
|
|
list_result = []
|
|
list_result = []
|
|
|
|
+ _product = _product.strip()
|
|
products = extract_products(list_data,_product)
|
|
products = extract_products(list_data,_product)
|
|
|
|
|
|
_product = get_correct_product(_product,products)
|
|
_product = get_correct_product(_product,products)
|
|
@@ -633,19 +696,34 @@ def extract_product_parameters(list_data,_product):
|
|
# print("result%d"%i,list_result[i])
|
|
# print("result%d"%i,list_result[i])
|
|
list_result.sort(key=lambda x:len(re.findall('[^.][0-9a-zA-Z]+[^.]',x)), reverse=True)
|
|
list_result.sort(key=lambda x:len(re.findall('[^.][0-9a-zA-Z]+[^.]',x)), reverse=True)
|
|
|
|
|
|
- return list_result[0] if len(list_result)>0 else None
|
|
|
|
|
|
+ print("+++++++++++++++++++++")
|
|
|
|
+ for i in range(len(list_result)):
|
|
|
|
+ print("result%d"%i,list_result[i])
|
|
|
|
+ print("+++++++++++++++++++++")
|
|
|
|
+
|
|
|
|
+ for _result in list_result:
|
|
|
|
+ _check = True
|
|
|
|
+ for p in products:
|
|
|
|
+ if _result.find(p)>0 and not is_similar(_product,p,80):
|
|
|
|
+ _check = False
|
|
|
|
+ if len(_result)<10:
|
|
|
|
+ _check = False
|
|
|
|
+ if _check:
|
|
|
|
+ return _result
|
|
|
|
+
|
|
|
|
+ return None
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Ad-hoc manual run: parse one downloaded HTML page, dump the resulting
    # title tree, then print the parameter text extracted for one product.
    # The `with` block closes the file handle as soon as the page is read.
    with open("download/107015f8e994683fd88827ad209f0d13.html", "r", encoding="utf8") as _file:
        _html = _file.read()

    pd = ParseDocument(_html,True)

    list_data = pd.tree
    pd.print_tree(list_data)

    _text = extract_product_parameters(list_data,"CT")
    print("extract_text",_text)
|
|
|
|
|