Przeglądaj źródła

新增挂牌时间正则提取,产品数量提取等

znj 2 lat temu
rodzic
commit
2f7b27c3bb

+ 1 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -2325,7 +2325,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
                             _text += sentences[idx]
                 _outline.outline_text = _text
                 _outline_summary = re.split("[::,]",_text,1)[0]
-                if len(_outline_summary)<20:
+                if len(_outline_summary)<30:
                     _outline.outline_summary = _outline_summary
                 # print(_outline.outline_index,_outline.outline_text)
 

+ 35 - 4
BiddingKG/dl/interface/getAttributes.py

@@ -1508,7 +1508,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     for one_phone in _phone:
                         PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
                         agency_phone.add(one_phone)
-
     # 正则提取电话号码实体
     # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
     phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
@@ -1530,6 +1529,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
     for _sentence in list_sentence:
         sentence_text = _sentence.sentence_text
+        # 过长数字串直接过滤替换
+        for _re in re.findall("\d{50,}",sentence_text):
+            sentence_text = sentence_text.replace(_re,"#"*len(_re))
         in_attachment = _sentence.in_attachment
         list_tokenbegin = []
         begin = 0
@@ -1556,6 +1558,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 continue
             res_set.add((i.group(), i.start(), i.end()))
         res_set = sorted(list(res_set),key=lambda x:x[1])
+        # 限制数量,防止异常数据处理时间过长
+        res_set = res_set[:200]
         last_phone_mask = True
         error_numStr_index = []
         sentence_phone_list = []
@@ -2061,7 +2065,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             for _p in person_phone:
                                 if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
                                     PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
-
     re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
     split_list = [0] * 16
     split_dict = {
@@ -2418,7 +2421,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         prepare_link.append(after_entity)
                         last_person = after_entity
                         continue
-
     # 统一同类角色的属性
     for k in PackDict.keys():
         for i in range(len(PackDict[k]["roleList"])):
@@ -3090,7 +3092,9 @@ def getTimeAttributes(list_entity,list_sentence):
         'time_earnestMoneyStart': [], #10 保证金递交开始时间(保证金递交时间)
         'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间
         'time_commencement':[] , #13 开工日期
-        'time_completion': []  # 14 竣工日期
+        'time_completion': [],  # 14 竣工日期
+        'time_listingStart': [],  # 15 挂牌开始日期(挂牌时间)
+        'time_listingEnd': []  # 16 挂牌结束日期、挂牌截止日期
     }
     last_sentence_index = 0
     last_time_type = ""
@@ -3101,22 +3105,49 @@ def getTimeAttributes(list_entity,list_sentence):
         'time_registrationStart':"time_registrationEnd",
         'time_earnestMoneyStart':"time_earnestMoneyEnd",
         'time_commencement':"time_completion",
+        'time_listingStart':"time_listingEnd"
     }
     for entity in time_entitys:
         sentence_text = list_sentence[entity.sentence_index].sentence_text
         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
+        entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
         label_prob = entity.values[entity.label]
         entity_text = entity.entity_text
         in_attachment = entity.in_attachment
         extract_time = my_timeFormat(entity_text)
         if extract_time:
+            # 2022/12/12 新增挂牌时间正则
+            if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
+                if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
+                    if len(extract_time) == 1:
+                        if re.search("挂牌.?(开始|起始).?(?:时间|日期)",entity_left2):
+                            dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
+                            last_time_type = 'time_listingStart'
+                        elif re.search("挂牌.?(截[止至]|结束).?(?:时间|日期)",entity_left2):
+                            dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
+                            last_time_type = 'time_listingEnd'
+                        elif re.search("挂牌.?(?:时间|日期)",entity_left2):
+                            if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                                dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
+                                last_time_type = 'time_listingEnd'
+                            else:
+                                dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
+                                last_time_type = 'time_listingStart'
+                    else:
+                        dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
+                        dict_time['time_listingEnd'].append((extract_time[1], 0.5, in_attachment))
+                        last_time_type = ''
+                    last_sentence_index = entity.sentence_index
+                    continue
+
             if re.search("至|到", entity_left):
                 if entity.sentence_index == last_sentence_index:
                     time_type = last_time_index.get(last_time_type)
                     if time_type:
                         dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment))
                         last_time_type = ""
+                        last_sentence_index = entity.sentence_index
                         continue
             if entity.label!=0:
                 if entity.label==1 and label_prob>0.5:

+ 25 - 3
BiddingKG/dl/interface/predictor.py

@@ -2801,16 +2801,19 @@ class ProductAttributesPredictor():
         list_outline = list_outlines[0]
         get_product_attrs = False
         for _outline in list_outline:
-            if re.search("信息|情况|清单",_outline.outline_summary):
+            if re.search("信息|情况|清单|概况",_outline.outline_summary):
                 outline_text = _outline.outline_text
                 outline_text = outline_text.replace(_outline.outline_summary,"")
                 key_value_list = [_split for _split in re.split("[,。;]",outline_text) if re.search("[::]",_split)]
+                if not key_value_list:
+                    continue
                 head_list = []
                 head_value_list = []
                 for key_value in key_value_list:
                     key_value = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key_value)
                     temp = re.split("[::]",key_value)
                     key = temp[-2]
+                    key = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key)
                     value = temp[-1]
                     head_list.append(key)
                     head_value_list.append(value)
@@ -2840,6 +2843,7 @@ class ProductAttributesPredictor():
                             tmp_head_list = head_list[begin_list[idx]:begin_list[idx+1]]
                         product = ""  # 产品
                         quantity = ""  # 数量
+                        quantity_unit = "" # 单位
                         unitPrice = ""  # 单价
                         brand = ""  # 品牌
                         specs = ""  # 规格
@@ -2857,6 +2861,7 @@ class ProductAttributesPredictor():
                             # print('header_dic: ',header_dic)
                             id1 = header_dic.get('名称', "")
                             id2 = header_dic.get('数量', "")
+                            id2_2 = header_dic.get('单位', "")
                             id3 = header_dic.get('单价', "")
                             id4 = header_dic.get('品牌', "")
                             id5 = header_dic.get('规格', "")
@@ -2870,8 +2875,25 @@ class ProductAttributesPredictor():
                                 if id2 != "":
                                     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
                                         quantity = deal_list[id2]
+                                        quantity = re.sub('[()(),,约]', '', quantity)
+                                        quantity = re.sub('[一壹]', '1', quantity)
+                                        ser = re.search('^(\d+(?:\.\d+)?)([㎡\w/]{,5})', quantity)
+                                        if ser:
+                                            quantity = str(ser.group(1))
+                                            quantity_unit = ser.group(2)
+                                        else:
+                                            quantity = ""
+                                            quantity_unit = ""
+                                if id2_2 != "":
+                                    if re.search('^\w{1,4}$', deal_list[id2_2]):
+                                        quantity_unit = deal_list[id2_2]
                                     else:
-                                        quantity = ""
+                                        quantity_unit = ""
+                                # if id2 != "":
+                                #     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
+                                #         quantity = deal_list[id2]
+                                #     else:
+                                #         quantity = ""
                                 if id3 != "":
                                     if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
                                         _unitPrice = deal_list[id3]
@@ -2912,7 +2934,7 @@ class ProductAttributesPredictor():
                                         order_begin, order_end = self.fix_time(order_time, html, page_time)
                                 # print(quantity,unitPrice,brand,specs)
                                 if quantity != "" or unitPrice != "" or brand != "" or specs != "":
-                                    link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
+                                    link = {'product': product, 'quantity': quantity, 'quantity_unit':quantity_unit,'unitPrice': unitPrice,
                                             'brand': brand[:50], 'specs': specs}
                                     if link not in product_link:
                                         product_link.append(link)