Przeglądaj źródła

附件图片表格对应、表格处理后文本大小限制

znj 2 lat temu
rodzic
commit
13b5fca834

+ 2 - 1
BiddingKG/dl/interface/Entitys.py

@@ -296,6 +296,7 @@ class Role():
         # 中投标人属性
         self.ratio = "" #2022/01/06 新增 保存中投标金额相关费率
         self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
+        self.address = ""  #2022/08/08 新增 角色地址
 
     def getString(self):
         self.linklist = [item for item in set(self.linklist)]
@@ -319,7 +320,7 @@ class Role():
                 discount_ratio = num_value
         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
-                  'linklist': self.linklist,'serviceTime':self.serviceTime}
+                  'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
         return result
 
 # 用于KM算法的组合配对

+ 60 - 6
BiddingKG/dl/interface/Preprocessing.py

@@ -1029,6 +1029,8 @@ def tableToText(soup):
 
 
             tbody.string = getTableText(inner_table,head_list)
+            table_max_len = 30000
+            tbody.string = tbody.string[:table_max_len]
             #print(tbody.string)
             tbody.name = "turntable"
             return inner_table
@@ -1256,6 +1258,7 @@ def segment(soup,final=True):
         # if child.name in spaceList:
         #     child.insert_after(" ")
     text = str(soup.get_text())
+    # print('text',text)
     #替换英文冒号为中文冒号
     text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
     #替换为中文逗号
@@ -1347,6 +1350,11 @@ def segment(soup,final=True):
         text = _text
     # 附件标识前修改为句号,避免正文和附件内容混合在一起
     text = re.sub("[^。](?=##attachment##)","。",text)
+    text = re.sub("[^。](?=##attachment_begin##)","。",text)
+    text = re.sub("[^。](?=##attachment_end##)","。",text)
+    text = re.sub("##attachment_begin##。","##attachment_begin##",text)
+    text = re.sub("##attachment_end##。","##attachment_end##",text)
+
     return text
 
 '''
@@ -1829,6 +1837,7 @@ def article_limit(soup,limit_words=30000):
                 attachment_part = child
                 have_attachment = True
                 break
+
     if not have_attachment:
         # 无附件
         if len(re.sub(sub_space, "", soup.get_text())) > limit_words:
@@ -1862,9 +1871,44 @@ def article_limit(soup,limit_words=30000):
                             attachment_skip = True
                     else:
                         part.decompose()
-
     return soup
 
+def attachment_filelink(soup):
+    have_attachment = False
+    attachment_part = None
+    for child in soup.find_all(recursive=True):
+        if child.name == 'div' and 'class' in child.attrs:
+            if "richTextFetch" in child['class']:
+                attachment_part = child
+                have_attachment = True
+                break
+    if not have_attachment:
+        return soup
+    else:
+        # 附件类型:图片、表格
+        attachment_type = re.compile("\.(?:png|jpg|jpeg|tif|bmp|xlsx|xls)$")
+        attachment_dict = dict()
+        for _attachment in attachment_part.find_all(recursive=False):
+            if _attachment.name == 'div' and 'filemd5' in _attachment.attrs:
+                # print('filemd5',_attachment['filemd5'])
+                attachment_dict[_attachment['filemd5']] = _attachment
+        # print(attachment_dict)
+        for child in soup.find_all(recursive=True):
+            if child.name == 'div' and 'class' in child.attrs:
+                if "richTextFetch" in child['class']:
+                    break
+            if "filelink" in child.attrs and child['filelink'] in attachment_dict:
+                if re.search(attachment_type,str(child.string).strip()) or \
+                        ('original' in child.attrs and re.search(attachment_type,str(child['original']).strip())):
+                    # 附件插入正文标识
+                    child.insert_before("。##attachment_begin##")
+                    child.insert_after("。##attachment_end##")
+                    child.replace_with(attachment_dict[child['filelink']])
+
+        # print('格式化输出',soup.prettify())
+        return soup
+
+
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
     '''
     :param articles: 待处理的article source html
@@ -1909,7 +1953,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
                 _soup.wrap(article_processed.new_tag("span"))
         # print(article_processed)
         # 正文和附件内容限制字数30000
-        article_processed = article_limit(article_processed,limit_words=30000)
+        article_processed = article_limit(article_processed, limit_words=30000)
+        # 把每个附件识别对应的html放回原来出现的位置
+        article_processed = attachment_filelink(article_processed)
+
         article_processed = get_preprocessed_outline(article_processed)
         # print('article_processed')
         article_processed = tableToText(article_processed)
@@ -1917,6 +1964,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
+        article_processed = re.sub('任务(?=编号[::])', '项目',article_processed)  # 2022/08/10 修正为项目编号
         # 修复OCR金额中“,”、“。”识别错误
         article_processed_list = article_processed.split("##attachment##")
         if len(article_processed_list)>1:
@@ -1967,7 +2015,6 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         if key_preprocess not in cost_time:
             cost_time[key_preprocess] = 0
         cost_time[key_preprocess] += round(time.time()-start_time,2)
-
         #article_processed = article[1]
         _article = Article(doc_id,article_processed,sourceContent,_send_doc_id,_title,
                            bidway=bidway)
@@ -2088,16 +2135,23 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
             #限流执行
             key_nerToken = "nerToken"
             start_time = time.time()
-            tokens_all = getTokens(sentences,useselffool=useselffool)
+            # tokens_all = getTokens(sentences,useselffool=useselffool)
+            tokens_all = getTokens([re.sub("##attachment_begin##|##attachment_end##","",_sen) for _sen in sentences],useselffool=useselffool)
             if key_nerToken not in cost_time:
                 cost_time[key_nerToken] = 0
             cost_time[key_nerToken] += round(time.time()-start_time,2)
 
             in_attachment = False
             for sentence_index in range(len(sentences)):
-                if sentence_index == attachment_begin_index:
-                    in_attachment = True
                 sentence_text = sentences[sentence_index]
+                if re.search("##attachment_begin##",sentence_text):
+                    in_attachment = True
+                    sentence_text = re.sub("##attachment_begin##","",sentence_text)
+                elif re.search("##attachment_end##",sentence_text):
+                    in_attachment = False
+                    sentence_text = re.sub("##attachment_end##", "", sentence_text)
+                if sentence_index >= attachment_begin_index and attachment_begin_index!=-1:
+                    in_attachment = True
                 tokens = tokens_all[sentence_index]
 
                 #pos_tag = pos_all[sentence_index]

+ 91 - 27
BiddingKG/dl/interface/getAttributes.py

@@ -1424,6 +1424,11 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
                     temp_data = []
             start = start + maxlen - 120
+        if temp_data:
+            deal_data += len(temp_data)
+            if deal_data <= 4:
+                for _text_data, _pre_data in temp_data:
+                    relation_list.extend(relationExtraction_model.predict(_text_data, _pre_data))
         # print("预测数据:",len(temp_data))
         # 去重结果
         relation_list = list(set(relation_list))
@@ -1514,6 +1519,53 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
                                 break
                 # print(3,combo[0].entity_text,combo[1].entity_text)
+
+        # "公司——地址" 链接规则补充
+        company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
+        company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
+        t_match_list = []
+        for ent_idx in range(len(company_lacation_EntityList)):
+            entity = company_lacation_EntityList[ent_idx]
+            if entity.entity_type in ['company', 'org']:
+                match_nums = 0
+                company_nums = 0  # 经过其他公司的数量
+                location_nums = 0  # 经过电话的数量
+                for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
+                    after_entity = company_lacation_EntityList[after_index]
+                    if after_entity.entity_type == "location":
+                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                                tokens_num_dict[entity.sentence_index] + entity.end_index)
+                        location_nums += 1
+                        if distance > 100 or location_nums >= 3:
+                            break
+                        sentence_distance = after_entity.sentence_index - entity.sentence_index
+                        value = (-1 / 2 * (distance ** 2)) / 10000
+                        if sentence_distance == 0:
+                            if distance < 80:
+                                t_match_list.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                                if company_nums:
+                                    break
+                        else:
+                            if distance < 50:
+                                t_match_list.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                                if company_nums:
+                                    break
+                    else:
+                        # type:company/org
+                        company_nums += 1
+                        if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
+                            break
+
+        # km算法分配求解
+        relate_location_result = dispatch(t_match_list)
+        relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
+        for match in relate_location_result:
+            _company = match[0]
+            _relation = match[1]
+            if not _company.pointer_address:
+                _company.pointer_address = _relation
     # "联系人——联系电话" 链接规则补充
     person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
     person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
@@ -1833,7 +1885,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                             match_list2.append(Match(entity, after_entity, value))
                                             match_nums += 1
                             if after_entity.entity_type in ['org', 'company']:
-                                if entity.label not in [2, 3, 4] and after_entity.label in [0, 1]:
+                                if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
                                     break
                                 # 解决在‘地址’中识别出org/company的问题
                                 # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]:
@@ -2072,18 +2124,17 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         continue
 
     # 统一同类角色的属性
-    if PackDict.get("Project"):
-        for i in range(len(PackDict["Project"]["roleList"])):
-            # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
+    for k in PackDict.keys():
+        for i in range(len(PackDict[k]["roleList"])):
             for _entity in list_entity:
                 if _entity.entity_type in ['org','company']:
                     is_same = False
                     is_similar = False
                     # entity_text相同
-                    if _entity.entity_text==PackDict["Project"]["roleList"][i].entity_text:
+                    if _entity.entity_text==PackDict[k]["roleList"][i].entity_text:
                         is_same = True
                     # entity.label为【0,1】
-                    if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict["Project"]["roleList"][i].role_name:
+                    if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict[k]["roleList"][i].role_name:
                         is_similar = True
                     if is_same:
                         linked_entitys = _entity.linked_entitys
@@ -2093,35 +2144,48 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 for _pointer_person in pointer_person:
                                     _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
                                     for _p in _phone:
-                                        if (_pointer_person.entity_text,_p) not in PackDict["Project"]["roleList"][i].linklist:
-                                            PackDict["Project"]["roleList"][i].linklist.append((_pointer_person.entity_text,_p))
+                                        if (_pointer_person.entity_text,_p) not in PackDict[k]["roleList"][i].linklist:
+                                            PackDict[k]["roleList"][i].linklist.append((_pointer_person.entity_text,_p))
                     elif is_similar:
                         pointer_person = _entity.pointer_person if _entity.pointer_person else []
                         for _pointer_person in pointer_person:
                             _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
                             for _p in _phone:
-                                if (_pointer_person.entity_text, _p) not in PackDict["Project"]["roleList"][i].linklist:
-                                    PackDict["Project"]["roleList"][i].linklist.append(
+                                if (_pointer_person.entity_text, _p) not in PackDict[k]["roleList"][i].linklist:
+                                    PackDict[k]["roleList"][i].linklist.append(
                                         (_pointer_person.entity_text, _p))
 
     # "roleList"中联系人电话去重
-    for i in range(len(PackDict["Project"]["roleList"])):
-        # print(123, PackDict["Project"]["roleList"][i].linklist)
-        # 带有联系人的电话
-        with_person = [person_phone[1] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[0]]
-        # 带有电话的联系人
-        with_phone = [person_phone[0] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[1]]
-        remove_list = []
-        for item in PackDict["Project"]["roleList"][i].linklist:
-            if not item[0]:
-                if item[1] in with_person:
-                    # 删除重复的无联系人电话
-                    remove_list.append(item)
-            elif not item[1]:
-                if item[0] in with_phone:
-                    remove_list.append(item)
-        for _item in remove_list:
-            PackDict["Project"]["roleList"][i].linklist.remove(_item)
+    for k in PackDict.keys():
+        for i in range(len(PackDict[k]["roleList"])):
+            # 带有联系人的电话
+            with_person = [person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[0]]
+            # 带有电话的联系人
+            with_phone = [person_phone[0] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]]
+            remove_list = []
+            for item in PackDict[k]["roleList"][i].linklist:
+                if not item[0]:
+                    if item[1] in with_person:
+                        # 删除重复的无联系人电话
+                        remove_list.append(item)
+                elif not item[1]:
+                    if item[0] in with_phone:
+                        remove_list.append(item)
+            for _item in remove_list:
+                PackDict[k]["roleList"][i].linklist.remove(_item)
+
+    # PackDict更新company/org地址
+    for ent in pre_entity:
+        if ent.entity_type in ['company','org']:
+            if ent.pointer_address:
+                for k in PackDict.keys():
+                    for i in range(len(PackDict[k]["roleList"])):
+                        if PackDict[k]["roleList"][i].entity_text == ent.entity_text:
+                            if not PackDict[k]["roleList"][i].address:
+                                PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                            else:
+                                if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
+                                    PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
 
     # 联系人——电子邮箱链接
     temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]

+ 3 - 2
BiddingKG/dl/interface/predictor.py

@@ -1500,6 +1500,7 @@ class TendereeRuleRecall():
                                         "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::][^。;,]{,5}$")
 
         self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
+                                         "^[\((][^。;::\))]{,5}称(?:招标|采购)(?:人|单位)|"
                                         "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
                                          "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
                                          "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
@@ -1518,8 +1519,8 @@ class TendereeRuleRecall():
                                 "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
         # 未识别实体尾部判断
         self.unrecognized_end1 = re.compile(
-            "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心)")
-        self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行)")
+            "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心|联合社|合作社)")
+        self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行|园)")
 
     def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
         # tenderee_notfound = True