瀏覽代碼

公告文本中逗号修正,时间修正

znj 3 年之前
父節點
當前提交
e8af8fdab3

+ 16 - 7
BiddingKG/dl/interface/Preprocessing.py

@@ -419,7 +419,7 @@ def tableToText(soup):
             else:
             else:
                 inner_table[_h][_w][1] = 0
                 inner_table[_h][_w][1] = 0
                 _item = inner_table[_h][_w][0]
                 _item = inner_table[_h][_w][0]
-                if re.search(pat_head,_item) is not None and len(item)<8:
+                if re.search(pat_head,_item) is not None and len(_item)<8:
                     inner_table[_h][_w][1] = 1
                     inner_table[_h][_w][1] = 1
 
 
         # print("=====")
         # print("=====")
@@ -430,7 +430,7 @@ def tableToText(soup):
         width = len(inner_table[0])
         width = len(inner_table[0])
         for i in range(height):
         for i in range(height):
             for j in range(width):
             for j in range(width):
-                if re.search("[::]$", inner_table[i][j][0]):
+                if re.search("[::]$", inner_table[i][j][0]) and len(inner_table[i][j][0])<8:
                     inner_table[i][j][1] = 1
                     inner_table[i][j][1] = 1
 
 
         repairTable(inner_table)
         repairTable(inner_table)
@@ -1472,15 +1472,24 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         start_time = time.time()
         start_time = time.time()
         article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
         article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
         # 修正被","逗号分隔的时间
         # 修正被","逗号分隔的时间
-        repair_time = re.compile("20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
-                                 "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d,?分|"
-                                 "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
-                                 "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?|"
+        repair_time = re.compile("[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
+                                 "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d,?分?|"
+                                 "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
+                                 "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?|"
                                  "[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d"
                                  "[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d"
                                  )
                                  )
         for _time in set(re.findall(repair_time,article_processed)):
         for _time in set(re.findall(repair_time,article_processed)):
             if re.search(",",_time):
             if re.search(",",_time):
-                article_processed = article_processed.replace(_time,re.sub(",","",_time))
+                _time2 = re.sub(",", "", _time)
+                item = re.search("[12]\d{3}[-—-―/][0-1]?\d[-—-―/][0-3]\d(?=\d)", _time2)
+                if item:
+                    _time2 = _time2.replace(item.group(),item.group() + " ")
+                article_processed = article_processed.replace(_time, _time2)
+            else:
+                item = re.search("[12]\d{3}[-—-―/][0-1]?\d[-—-―/][0-3]\d(?=\d)", _time)
+                if item:
+                    _time2 = _time.replace(item.group(),item.group() + " ")
+                    article_processed = article_processed.replace(_time, _time2)
         # print('re_rtime',re.findall(repair_time,article_processed))
         # print('re_rtime',re.findall(repair_time,article_processed))
         # log(article_processed)
         # log(article_processed)
 
 

+ 1 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -941,6 +941,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     
     
     ''''''
     ''''''
     # 通过模型分类的招标/代理联系人
     # 通过模型分类的招标/代理联系人
+    list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
     person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]]
     person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]]
     tenderee_contact = set()
     tenderee_contact = set()
     tenderee_phone = set()
     tenderee_phone = set()
@@ -1259,7 +1260,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         "十四、": 14,
         "十四、": 14,
         "十五、": 15
         "十五、": 15
     }
     }
-    list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
 
 
     for item in re.finditer(re_split, _content):
     for item in re.finditer(re_split, _content):
         _index = split_dict.get(item.group()[1:])
         _index = split_dict.get(item.group()[1:])

+ 3 - 0
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -121,6 +121,9 @@ def predict(doc_id,text):
                 print("联系方式:",end=' ')
                 print("联系方式:",end=' ')
                 print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else None,entity.label,entity.values)
                 print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else None,entity.label,entity.values)
                 print(entity.sentence_index)
                 print(entity.sentence_index)
+            elif entity.entity_type=="time":
+                print("time:",end=" ")
+                print(entity.entity_text)
             elif entity.entity_type in ['org','company']:
             elif entity.entity_type in ['org','company']:
                 _sentence = list_sentences[0][entity.sentence_index]
                 _sentence = list_sentences[0][entity.sentence_index]
                 if entity.pointer_person:
                 if entity.pointer_person: