2 lat temu · 2f7b27c3bb
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -2325,7 +2325,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
				                             _text += sentences[idx]
			
 
				                 _outline.outline_text = _text
			
 
				                 _outline_summary = re.split("[:：，]",_text,1)[0]
			
 
				-                if len(_outline_summary)<20:
			
 
				+                if len(_outline_summary)<30:
			
 
				                     _outline.outline_summary = _outline_summary
			
 
				                 # print(_outline.outline_index,_outline.outline_text)
			
 
				 
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -1508,7 +1508,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                     for one_phone in _phone:
			
 
				                         PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
			
 
				                         agency_phone.add(one_phone)
			
 
				-
			
 
				     # 正则提取电话号码实体
			
 
				     # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
			
 
				     phone = re.compile('1[3-9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|'
			
@@ -1530,6 +1529,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				     code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
			
 
				     for _sentence in list_sentence:
			
 
				         sentence_text = _sentence.sentence_text
			
 
				+        # 过长数字串直接过滤替换
			
 
				+        for _re in re.findall("\d{50,}",sentence_text):
			
 
				+            sentence_text = sentence_text.replace(_re,"#"*len(_re))
			
 
				         in_attachment = _sentence.in_attachment
			
 
				         list_tokenbegin = []
			
 
				         begin = 0
			
@@ -1556,6 +1558,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                 continue
			
 
				             res_set.add((i.group(), i.start(), i.end()))
			
 
				         res_set = sorted(list(res_set),key=lambda x:x[1])
			
 
				+        # 限制数量，防止异常数据处理时间过长
			
 
				+        res_set = res_set[:200]
			
 
				         last_phone_mask = True
			
 
				         error_numStr_index = []
			
 
				         sentence_phone_list = []
			
@@ -2061,7 +2065,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                             for _p in person_phone:
			
 
				                                 if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
			
 
				                                     PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
			
 
				-
			
 
				     re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
			
 
				     split_list = [0] * 16
			
 
				     split_dict = {
			
@@ -2418,7 +2421,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                         prepare_link.append(after_entity)
			
 
				                         last_person = after_entity
			
 
				                         continue
			
 
				-
			
 
				     # 统一同类角色的属性
			
 
				     for k in PackDict.keys():
			
 
				         for i in range(len(PackDict[k]["roleList"])):
			
@@ -3090,7 +3092,9 @@ def getTimeAttributes(list_entity,list_sentence):
 
				         'time_earnestMoneyStart': [], #10 保证金递交开始时间（保证金递交时间）
			
 
				         'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间
			
 
				         'time_commencement':[] , #13 开工日期
			
 
				-        'time_completion': []  # 14 竣工日期
			
 
				+        'time_completion': [],  # 14 竣工日期
			
 
				+        'time_listingStart': [],  # 15 挂牌开始日期（挂牌时间）
			
 
				+        'time_listingEnd': []  # 16 挂牌结束日期、挂牌截止日期
			
 
				     }
			
 
				     last_sentence_index = 0
			
 
				     last_time_type = ""
			
@@ -3101,22 +3105,49 @@ def getTimeAttributes(list_entity,list_sentence):
 
				         'time_registrationStart':"time_registrationEnd",
			
 
				         'time_earnestMoneyStart':"time_earnestMoneyEnd",
			
 
				         'time_commencement':"time_completion",
			
 
				+        'time_listingStart':"time_listingEnd"
			
 
				     }
			
 
				     for entity in time_entitys:
			
 
				         sentence_text = list_sentence[entity.sentence_index].sentence_text
			
 
				         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
			
 
				+        entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
			
 
				         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
			
 
				         label_prob = entity.values[entity.label]
			
 
				         entity_text = entity.entity_text
			
 
				         in_attachment = entity.in_attachment
			
 
				         extract_time = my_timeFormat(entity_text)
			
 
				         if extract_time:
			
 
				+            # 2022/12/12 新增挂牌时间正则
			
 
				+            if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
			
 
				+                if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
			
 
				+                    if len(extract_time) == 1:
			
 
				+                        if re.search("挂牌.?(开始|起始).?(?:时间|日期)",entity_left2):
			
 
				+                            dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                            last_time_type = 'time_listingStart'
			
 
				+                        elif re.search("挂牌.?(截[止至]|结束).?(?:时间|日期)",entity_left2):
			
 
				+                            dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                            last_time_type = 'time_listingEnd'
			
 
				+                        elif re.search("挂牌.?(?:时间|日期)",entity_left2):
			
 
				+                            if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
			
 
				+                                dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                                last_time_type = 'time_listingEnd'
			
 
				+                            else:
			
 
				+                                dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                                last_time_type = 'time_listingStart'
			
 
				+                    else:
			
 
				+                        dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                        dict_time['time_listingEnd'].append((extract_time[1], 0.5, in_attachment))
			
 
				+                        last_time_type = ''
			
 
				+                    last_sentence_index = entity.sentence_index
			
 
				+                    continue
			
 
				+
			
 
				             if re.search("至|到", entity_left):
			
 
				                 if entity.sentence_index == last_sentence_index:
			
 
				                     time_type = last_time_index.get(last_time_type)
			
 
				                     if time_type:
			
 
				                         dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment))
			
 
				                         last_time_type = ""
			
 
				+                        last_sentence_index = entity.sentence_index
			
 
				                         continue
			
 
				             if entity.label!=0:
			
 
				                 if entity.label==1 and label_prob>0.5:
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -2801,16 +2801,19 @@ class ProductAttributesPredictor():
 
				         list_outline = list_outlines[0]
			
 
				         get_product_attrs = False
			
 
				         for _outline in list_outline:
			
 
				-            if re.search("信息|情况|清单",_outline.outline_summary):
			
 
				+            if re.search("信息|情况|清单|概况",_outline.outline_summary):
			
 
				                 outline_text = _outline.outline_text
			
 
				                 outline_text = outline_text.replace(_outline.outline_summary,"")
			
 
				                 key_value_list = [_split for _split in re.split("[，。；]",outline_text) if re.search("[：:]",_split)]
			
 
				+                if not key_value_list:
			
 
				+                    continue
			
 
				                 head_list = []
			
 
				                 head_value_list = []
			
 
				                 for key_value in key_value_list:
			
 
				                     key_value = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\(（]?[一二三四五六七八九十]{1,3}[\)）][、]?","",key_value)
			
 
				                     temp = re.split("[:：]",key_value)
			
 
				                     key = temp[-2]
			
 
				+                    key = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\(（]?[一二三四五六七八九十]{1,3}[\)）][、]?","",key)
			
 
				                     value = temp[-1]
			
 
				                     head_list.append(key)
			
 
				                     head_value_list.append(value)
			
@@ -2840,6 +2843,7 @@ class ProductAttributesPredictor():
 
				                             tmp_head_list = head_list[begin_list[idx]:begin_list[idx+1]]
			
 
				                         product = ""  # 产品
			
 
				                         quantity = ""  # 数量
			
 
				+                        quantity_unit = "" # 单位
			
 
				                         unitPrice = ""  # 单价
			
 
				                         brand = ""  # 品牌
			
 
				                         specs = ""  # 规格
			
@@ -2857,6 +2861,7 @@ class ProductAttributesPredictor():
 
				                             # print('header_dic: ',header_dic)
			
 
				                             id1 = header_dic.get('名称', "")
			
 
				                             id2 = header_dic.get('数量', "")
			
 
				+                            id2_2 = header_dic.get('单位', "")
			
 
				                             id3 = header_dic.get('单价', "")
			
 
				                             id4 = header_dic.get('品牌', "")
			
 
				                             id5 = header_dic.get('规格', "")
			
@@ -2870,8 +2875,25 @@ class ProductAttributesPredictor():
 
				                                 if id2 != "":
			
 
				                                     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
			
 
				                                         quantity = deal_list[id2]
			
 
				+                                        quantity = re.sub('[()（）,，约]', '', quantity)
			
 
				+                                        quantity = re.sub('[一壹]', '1', quantity)
			
 
				+                                        ser = re.search('^(\d+(?:\.\d+)?)([㎡\w/]{,5})', quantity)
			
 
				+                                        if ser:
			
 
				+                                            quantity = str(ser.group(1))
			
 
				+                                            quantity_unit = ser.group(2)
			
 
				+                                        else:
			
 
				+                                            quantity = ""
			
 
				+                                            quantity_unit = ""
			
 
				+                                if id2_2 != "":
			
 
				+                                    if re.search('^\w{1,4}$', deal_list[id2_2]):
			
 
				+                                        quantity_unit = deal_list[id2_2]
			
 
				                                     else:
			
 
				-                                        quantity = ""
			
 
				+                                        quantity_unit = ""
			
 
				+                                # if id2 != "":
			
 
				+                                #     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
			
 
				+                                #         quantity = deal_list[id2]
			
 
				+                                #     else:
			
 
				+                                #         quantity = ""
			
 
				                                 if id3 != "":
			
 
				                                     if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
			
 
				                                         _unitPrice = deal_list[id3]
			
@@ -2912,7 +2934,7 @@ class ProductAttributesPredictor():
 
				                                         order_begin, order_end = self.fix_time(order_time, html, page_time)
			
 
				                                 # print(quantity,unitPrice,brand,specs)
			
 
				                                 if quantity != "" or unitPrice != "" or brand != "" or specs != "":
			
 
				-                                    link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
			
 
				+                                    link = {'product': product, 'quantity': quantity, 'quantity_unit':quantity_unit,'unitPrice': unitPrice,
			
 
				                                             'brand': brand[:50], 'specs': specs}
			
 
				                                     if link not in product_link:
			
 
				                                         product_link.append(link)