Quellcode durchsuchen

Merge branch 'master' of http://192.168.2.65:3000/BIDI-ML/BIDI_ML_INFO_EXTRACTION

lishimin vor 3 Jahren
Ursprung
Commit
8c0dcd0129

+ 29 - 2
BiddingKG/dl/bidway/re_bidway.py

@@ -385,6 +385,32 @@ def extract_bidway(text, title):
         # print(d.get("body"), d.get("begin_index"), d.get("end_index"))
     return list_bidway
 
+# Lookup table used by bidway_integrate(): each key is a raw bidway phrase
+# and each value is the normalized name it is reported as. The values used
+# are 询价, 竞争性谈判, 竞争性磋商, 公开招标, 邀请招标, 竞价, 单一来源 and 其他.
+# NOTE(review): the '\\N' key (a literal backslash followed by N) maps to the
+# empty string -- presumably a NULL marker from exported data; confirm.
+bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
+               '公开比选': '其他', '国内竞争性磋商': '竞争性磋商',
+               '招标方式:t公开': '公开招标', '竞价': '竞价',
+               '竞标': '竞价', '电子竞价': '竞价',
+               '电子书面竞投': '竞价', '单一来源': '单一来源',
+               '网上竞价': '竞价', '公开招标': '公开招标',
+               '询比': '询价', '定点采购': '其他',
+               '招标方式:■公开': '公开招标', '交易其他,付款其他': '其他',
+               '竞争性评审': '竞争性磋商', '公开招租': '其他', '\\N': '',
+               '比选': '其他', '比质比价': '其他', '分散采购': '其他',
+               '内部邀标': '邀请招标', '邀请招标': '邀请招标',
+               '网上招标': '公开招标', '非定向询价': '询价',
+               '网络竞价': '竞价', '公开询价': '询价',
+               '定点采购议价': '其他', '询单': '询价',
+               '网上挂牌': '其他', '网上直购': '其他',
+               '定向询价': '询价', '采购方式:公开': '公开招标',
+               '磋商': '竞争性磋商', '公开招投标': '公开招标',
+               '招标方式:√公开': '公开招标', '公开选取': '公开招标',
+               '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
+               '竞争性磋商': '竞争性磋商', '采购方式:邀请': '邀请招标',
+               '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
+               '网上询价': '询价'}
+# Normalize a raw bidway phrase to its canonical name.
+def bidway_integrate(bidway):
+    """Return the normalized bidway name for ``bidway``.
+
+    The phrase is looked up in ``bidway_dict``; any phrase that is not a key
+    of that table is reported as "其他".
+    """
+    return bidway_dict.get(bidway, "其他")
 
 def test_csv():
     df = pd.read_csv("C:\\Users\\Administrator\\Desktop\\bidway_text.csv")
@@ -441,7 +467,8 @@ def test_html():
 if __name__ == "__main__":
     # extract_bidway(s)
 
-    test_csv()
-    # test_str()
+    # test_csv()
+    test_str()
     # test_html()
+    pass
 

+ 1 - 0
BiddingKG/dl/interface/Entitys.py

@@ -165,6 +165,7 @@ class Entity():
         self.pointer_tendereeMoney = None
         # self.person_phone = person_phone
         self.person_phone = []
+        self.pointer_email = None
         self.is_tail = False
         self.notes = ''  # 2021/7/20 新增,保存金额大小写,单位等备注
         self.money_unit = '' #2021/8/17 新增,保存金额单位 元、万元 、亿元

+ 72 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -15,7 +15,8 @@ from BiddingKG.dl.interface.predictor import getPredictor
 from BiddingKG.dl.common.nerUtils import *
 from BiddingKG.dl.money.moneySource.ruleExtra import extract_moneySource
 from BiddingKG.dl.time.re_servicetime import extract_servicetime
-from BiddingKG.dl.bidway.re_bidway import extract_bidway
+from BiddingKG.dl.relation_extraction.re_email import extract_email
+from BiddingKG.dl.bidway.re_bidway import extract_bidway,bidway_integrate
 from BiddingKG.dl.fingerprint.documentFingerprint import getFingerprint
 from BiddingKG.dl.entityLink.entityLink import *
 
@@ -437,7 +438,7 @@ def tableToText(soup):
 
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
-
+        # print("inner_table:",inner_table)
 
         return inner_table,head_list
                     
@@ -1078,7 +1079,7 @@ def segment(soup,final=True):
     #替换为中文分号
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
     #替换"?"为 " " ,update:2021/7/20
-    text = re.sub("?+"," ",text)
+    text = re.sub("?"," ",text)
          
 
     #替换"""为"“",否则导入deepdive出错
@@ -1488,6 +1489,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         list_bidway = extract_bidway(article_processed, _title)
         if list_bidway:
             bidway = list_bidway[0].get("body")
+            # bidway名称统一规范
+            bidway = bidway_integrate(bidway)
         else:
             bidway = ""
 
@@ -1920,6 +1923,48 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 else:
                     index += 1
 
+            # "联系人"正则补充提取  2021/11/15 新增
+            list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
+            error_text = ['传真','网址','电子邮','联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
+            list_person_text = set(list_person_text + error_text)
+            re_person = re.compile("联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"
+                                   "联系人[::]([\u4e00-\u9fa5]工)|"
+                                   "联系人[::]([\u4e00-\u9fa5]{2,3})")
+            list_person = []
+            for match_result in re_person.finditer(sentence_text):
+                match_text = match_result.group()
+                entity_text = match_text[4:]
+                wordOffset_begin = match_result.start() + 4
+                wordOffset_end = match_result.end()
+                # print(text[wordOffset_begin:wordOffset_end])
+                if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
+                    _person = dict()
+                    _person['body'] = entity_text
+                    _person['begin_index'] = wordOffset_begin
+                    _person['end_index'] = wordOffset_end
+                    list_person.append(_person)
+            entity_type = "person"
+            for person in list_person:
+                begin_index_temp = person['begin_index']
+                for j in range(len(list_tokenbegin)):
+                    if list_tokenbegin[j] == begin_index_temp:
+                        begin_index = j
+                        break
+                    elif list_tokenbegin[j] > begin_index_temp:
+                        begin_index = j - 1
+                        break
+                index = person['end_index']
+                end_index_temp = index
+                for j in range(begin_index, len(list_tokenbegin)):
+                    if list_tokenbegin[j] >= index:
+                        end_index = j - 1
+                        break
+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+                entity_text = person['body']
+                list_sentence_entitys.append(
+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp))
+
             # 资金来源提取  2020/12/30 新增
             list_moneySource = extract_moneySource(sentence_text)
             entity_type = "moneysource"
@@ -1944,6 +1989,30 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
                            begin_index_temp, end_index_temp))
 
+            # 电子邮箱提取 2021/11/04 新增
+            list_email = extract_email(sentence_text)
+            entity_type = "email" # 电子邮箱
+            for email in list_email:
+                begin_index_temp = email['begin_index']
+                for j in range(len(list_tokenbegin)):
+                    if list_tokenbegin[j] == begin_index_temp:
+                        begin_index = j
+                        break
+                    elif list_tokenbegin[j] > begin_index_temp:
+                        begin_index = j - 1
+                        break
+                index = email['end_index']
+                end_index_temp = index
+                for j in range(begin_index, len(list_tokenbegin)):
+                    if list_tokenbegin[j] >= index:
+                        end_index = j - 1
+                        break
+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+                entity_text = email['body']
+                list_sentence_entitys.append(
+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp))
+
             # 服务期限提取 2020/12/30 新增
             list_servicetime = extract_servicetime(sentence_text)
             entity_type = "serviceTime"

+ 80 - 35
BiddingKG/dl/interface/getAttributes.py

@@ -1000,23 +1000,34 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             agency_contact.add(_person.entity_text)
     # 正则匹配无 '主体/联系人' 的电话
     # 例:"采购人联系方式:0833-5226788,"
+    phone_pattern = '(1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
+                    '\+86.?1[3|4|5|6|7|8|9]\d{9}|' \
+                    '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' \
+                    '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|' \
+                    '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|' \
+                    '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|' \
+                   '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' \
+                   '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' \
+                   '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' \
+                   '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' \
+                   '[2-9]\d{6,7})'
     re_tenderee_phone = re.compile(
         "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
         # 电话号码
-        "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
+        + phone_pattern)
     # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
     re_tenderee_phone2 = re.compile(
         "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
         # 电话号码
-        "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
+        + phone_pattern)
     re_agent_phone = re.compile(
         "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
         # 电话号码
-        "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
+        + phone_pattern)
     re_agent_phone2 = re.compile(
         "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
         # 电话号码
-        "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
+        + phone_pattern)
     content = ""
     for _sentence in list_sentence:
         content += "".join(_sentence.tokens)
@@ -1036,24 +1047,32 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             _tenderee_phone = re.findall(re_tenderee_phone, content)
             if _tenderee_phone:
                 for _phone in _tenderee_phone:
-                    PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
-                    tenderee_phone.add(_phone)
+                    _phone = _phone.split("/") # 分割多个号码
+                    for one_phone in _phone:
+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
+                        tenderee_phone.add(one_phone)
             _tenderee_phone2 = re.findall(re_tenderee_phone2, content)
             if _tenderee_phone2:
                 for _phone in _tenderee_phone2:
-                    PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
-                    tenderee_phone.add(_phone)
+                    _phone = _phone.split("/")
+                    for one_phone in _phone:
+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
+                        tenderee_phone.add(one_phone)
         if PackDict["Project"]["roleList"][i].role_name == "agency":
             _agent_phone = re.findall(re_agent_phone, content)
             if _agent_phone:
                 for _phone in _agent_phone:
-                    PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
-                    agency_phone.add(_phone)
+                    _phone = _phone.split("/")
+                    for one_phone in _phone:
+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
+                        agency_phone.add(one_phone)
             _agent_phone2 = re.findall(re_agent_phone2, content)
             if _agent_phone2:
                 for _phone in _agent_phone2:
-                    PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
-                    agency_phone.add(_phone)
+                    _phone = _phone.split("/")
+                    for one_phone in _phone:
+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
+                        agency_phone.add(one_phone)
     # km配对方法
     def dispatch(match_list):
         main_roles = list(set([match.main_role for match in match_list]))
@@ -1077,13 +1096,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
     phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                        '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
-                       '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
-                       '0[^0]\d{1,2}[-—-―]\d{7,8}转\d{1,4}|'
-                       '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
-                       '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
-                       '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
-                       '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
-                       '[\(|\(]0[^0]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                       # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
+                       '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
+                       '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
                        '[2-9]\d{6,7}')
     phone_entitys = []
     for _sentence in list_sentence:
@@ -1100,17 +1120,30 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             res_set.add((i.group(), i.start(), i.end()))
         # for i in re.finditer(key_word, sentence_text):
         #     res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
-        for item in list(res_set):
+        res_set = sorted(list(res_set),key=lambda x:x[1])
+        last_phone_mask = True
+        for item_idx in range(len(res_set)):
+            item = res_set[item_idx]
             phone_left = sentence_text[max(0, item[1] - 10):item[1]]
             phone_right = sentence_text[item[2]:item[2] + 8]
             # 排除“传真号”和其它错误项
-            if re.search("传,?真|信,?箱|邮,?箱", phone_left):
+            if re.search("传,?真|信,?箱|邮,?[件]", phone_left):
                 if not re.search("电,?话", phone_left):
+                    last_phone_mask = False
                     continue
             if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left):
+                last_phone_mask = False
                 continue
-            if re.search("[.,]\d{2,}", phone_right):
+            if re.search("^\d{0,4}[.,]\d{2,}", phone_right):
+                last_phone_mask = False
                 continue
+            # if:上一个phone实体不符合条件
+            if not last_phone_mask:
+                item_start = item[1]
+                last_item_end = res_set[item_idx-1][2]
+                if item_start - last_item_end<=1 or re.search("^\d+$",sentence_text[last_item_end:item_start]):
+                    last_phone_mask = False
+                    continue
             for j in range(len(list_tokenbegin)):
                 if list_tokenbegin[j] == item[1]:
                     begin_index = j
@@ -1125,6 +1158,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
                              item[2])
             phone_entitys.append(_entity)
+            last_phone_mask = True
+
     def is_company(entity,text):
         # 判断"公司"实体是否为地址地点
         if entity.label!=5 and entity.values[entity.label]>0.5:
@@ -1134,7 +1169,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         entity_left = text[max(0,entity.wordOffset_begin-10):entity.wordOffset_begin]
         entity_left = re.sub(",()\(\)::","",entity_left)
         entity_left = entity_left[-5:]
-        if re.search("地址|地点",entity_left):
+        if re.search("地址|地点|银行[::]",entity_left):
             return False
         else:
             return True
@@ -1156,7 +1191,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             _pre_data = pre_data[start:start+maxlen]
             _text_data = text_data[start:start+maxlen]
             relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
-            start = start + maxlen - 100
+            start = start + maxlen - 120
         # 去重结果
         relation_list = list(set(relation_list))
     # print(relation_list)
@@ -1364,12 +1399,17 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         last_words_num = len(sentence.sentence_text)
 
     # 公司-联系人连接(km算法)
-    re_phone = re.compile('1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|'
-                          '0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,10}|'
-                          '0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|'
-                          '0\d{2,3}[-—-]?[1-9]\d{6,7}|'
-                          '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
-                          '[1-9]\d{6,7}')
+    re_phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
+                       '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
+                       '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
+                       '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
+                       '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                       '[2-9]\d{6,7}')
     key_phone = re.compile("联系方式|电话|联系人|负责人")
     temporary_list2 = []
     for entity in list_entity:
@@ -1553,8 +1593,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     # km算法分配求解
     result2 = dispatch(match_list2)
     # print(result2)
-    linked_person = []
-    linked_persons_with = []
     for match in result2:
         entity = match[0]
         # print(entity.entity_text)
@@ -1563,7 +1601,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         is_update = False
         if isinstance(match[1], tuple):
             person_ = ''
-            phone_ = [match[1][1]]
+            phone_ = match[1][1].split("/") # 分割多个号码
+            # print(person_,phone_)
         else:
             person_ = match[1].entity_text
             phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
@@ -1576,6 +1615,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                 if not phone_:
                                     PackDict[k]["roleList"][i].linklist.append((person_, ""))
                                 for p in phone_:
+                                    # if not person_ and len()
                                     PackDict[k]["roleList"][i].linklist.append((person_, p))
                                 is_update = True
                 elif PackDict[k]["roleList"][i].role_name == "agency":
@@ -1604,9 +1644,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             if not list_entity[entity_index].pointer_person:
                 list_entity[entity_index].pointer_person = []
             list_entity[entity_index].pointer_person.append(match[1])
-            linked_person.append(match[1])
-            linked_persons_with.append(entity)
 
+    linked_person = []
+    linked_persons_with = []
+    for company_entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]:
+        if company_entity.pointer_person:
+            for _person in company_entity.pointer_person:
+                linked_person.append(_person)
+                linked_persons_with.append(company_entity)
 
     # 一个公司对应多个联系人的补充
     person_entitys = [entity for entity in list_entity if entity.entity_type=='person']

+ 2 - 0
BiddingKG/dl/interface/modelFactory.py

@@ -261,6 +261,7 @@ class Model_relation_extraction():
     def predict(self,text_in, words, rate=0.5):
         # text_words = text_in
         triple_list = []
+        # print("tokens:",words)
         # _t2 = [self.words2id.get(c, 1) for c in words]
         _t2 = np.zeros((len(words), self.words_size))
         for i in range(len(words)):
@@ -293,6 +294,7 @@ class Model_relation_extraction():
                     _object = text_in[_ooo1]
                     _predicate = self.id2predicate[_c1]
                     triple_list.append((_subject[0], _predicate, _object))
+            # print([(t[0].entity_text,t[1],t[2].entity_text) for t in triple_list])
             return triple_list
         else:
             return []

+ 1 - 1
BiddingKG/dl/money/moneySource/ruleExtra.py

@@ -3,7 +3,7 @@ import os
 sys.path.append(os.path.abspath("../.."))
 import pandas as pd
 import re
-from BiddingKG.dl.interface import Entitys
+# from BiddingKG.dl.interface import Entitys
 
 def re_rule():
 

+ 30 - 0
BiddingKG/dl/relation_extraction/re_email.py

@@ -0,0 +1,30 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
+import re
+# from BiddingKG.dl.interface import Entitys
+
+
+# Compiled once at import time. Raw string literals keep "\." a regex escape
+# instead of an invalid Python string escape (SyntaxWarning on Python 3.12+).
+# The local part accepts a single leading character followed by zero or more
+# word characters, so addresses such as "a@b.cn" or "x.li@qq.com" are matched
+# in full rather than skipped or truncated to "li@qq.com".
+_EMAIL_PATTERN = re.compile(
+    r"[a-zA-Z0-9][a-zA-Z0-9_-]*(?:\.[a-zA-Z0-9_-]+)*@"
+    r"[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
+
+
+def extract_email(text):
+    """Find every e-mail address in ``text``.
+
+    Args:
+        text: the string to scan.
+
+    Returns:
+        A list with one dict per match, in order of appearance. Each dict
+        has the keys ``'body'`` (the matched address), ``'begin_index'``
+        (offset of its first character in ``text``) and ``'end_index'``
+        (offset just past its last character). The list is empty when
+        nothing matches.
+    """
+    return [
+        {
+            'body': match_result.group(),
+            'begin_index': match_result.start(),
+            'end_index': match_result.end(),
+        }
+        for match_result in _EMAIL_PATTERN.finditer(text)
+    ]
+
+
+if __name__ == '__main__':
+    # Sample sentence for a manual check of extract_email(). The calls below
+    # are commented out, so running this module only performs the assignment.
+    text ="联系人: 李春宜 联系电话:电话:0755-89663666-2492 邮箱:chun_yi.li@ci16-3mc.com.cn 邮箱:邮箱:chun_yi.li@qq.com"
+    # extract_email(text)
+    # print(extract_email(text))
+    pass

+ 5 - 3
BiddingKG/dl/test/test4.py

@@ -38,7 +38,8 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
+    # text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -65,8 +66,9 @@ if __name__=="__main__":
     # content = '''
     # 广州比地数据科技有限公司翻译服务工程招标
     # '''
-    print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
+    # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
     # print(predict("12", text))
-    # test("12",content)
+    # test("12",text)
+    test("12",content)
     print("takes",time.time()-_time1)
     pass

+ 6 - 1
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -69,7 +69,7 @@ class MyEncoder(json.JSONEncoder):
 
 
 def predict(doc_id,text):
-    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
+    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","","",""]],useselffool=True)
     for articles in list_articles:
         print('预处理后文本信息')
         print(articles.content)
@@ -123,10 +123,15 @@ def predict(doc_id,text):
             if entity.entity_type=='person':
                 print("联系方式:",end=' ')
                 print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else None,entity.label,entity.values)
+                # print(entity.begin_index, entity.end_index)
                 print(entity.sentence_index)
+                pass
             elif entity.entity_type=="time":
                 print("time:",end=" ")
                 print(entity.entity_text, entity.label, entity.values)
+            elif entity.entity_type=="email":
+                print("email:",end=" ")
+                print(entity.entity_text, entity.begin_index, entity.end_index)
             elif entity.entity_type in ['org','company']:
                 _sentence = list_sentences[0][entity.sentence_index]
                 if entity.pointer_person: