vor 3 Jahren · 8c0dcd0129
--- a/BiddingKG/dl/bidway/re_bidway.py
+++ b/BiddingKG/dl/bidway/re_bidway.py
@@ -385,6 +385,32 @@ def extract_bidway(text, title):
 
				         # print(d.get("body"), d.get("begin_index"), d.get("end_index"))
			
 
				     return list_bidway
			
 
				 
			
 
				+bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
			
 
				+               '公开比选': '其他', '国内竞争性磋商': '竞争性磋商',
			
 
				+               '招标方式：t公开': '公开招标', '竞价': '竞价',
			
 
				+               '竞标': '竞价', '电子竞价': '竞价',
			
 
				+               '电子书面竞投': '竞价', '单一来源': '单一来源',
			
 
				+               '网上竞价': '竞价', '公开招标': '公开招标',
			
 
				+               '询比': '询价', '定点采购': '其他',
			
 
				+               '招标方式：■公开': '公开招标', '交易其他，付款其他': '其他',
			
 
				+               '竞争性评审': '竞争性磋商', '公开招租': '其他', '\\N': '',
			
 
				+               '比选': '其他', '比质比价': '其他', '分散采购': '其他',
			
 
				+               '内部邀标': '邀请招标', '邀请招标': '邀请招标',
			
 
				+               '网上招标': '公开招标', '非定向询价': '询价',
			
 
				+               '网络竞价': '竞价', '公开询价': '询价',
			
 
				+               '定点采购议价': '其他', '询单': '询价',
			
 
				+               '网上挂牌': '其他', '网上直购': '其他',
			
 
				+               '定向询价': '询价', '采购方式：公开': '公开招标',
			
 
				+               '磋商': '竞争性磋商', '公开招投标': '公开招标',
			
 
				+               '招标方式：√公开': '公开招标', '公开选取': '公开招标',
			
 
				+               '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
			
 
				+               '竞争性磋商': '竞争性磋商', '采购方式：邀请': '邀请招标',
			
 
				+               '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
			
 
				+               '网上询价': '询价'}
			
 
				+# bidway名称统一规范
			
 
				+def bidway_integrate(bidway):
			
 
				+    integrate_name = bidway_dict.get(bidway,"其他")
			
 
				+    return integrate_name
			
 
				 
			
 
				 def test_csv():
			
 
				     df = pd.read_csv("C:\\Users\\Administrator\\Desktop\\bidway_text.csv")
			
@@ -441,7 +467,8 @@ def test_html():
 
				 if __name__ == "__main__":
			
 
				     # extract_bidway(s)
			
 
				 
			
 
				-    test_csv()
			
 
				-    # test_str()
			
 
				+    # test_csv()
			
 
				+    test_str()
			
 
				     # test_html()
			
 
				+    pass
			
 
				 
			
--- a/BiddingKG/dl/interface/Entitys.py
+++ b/BiddingKG/dl/interface/Entitys.py
@@ -165,6 +165,7 @@ class Entity():
 
				         self.pointer_tendereeMoney = None
			
 
				         # self.person_phone = person_phone
			
 
				         self.person_phone = []
			
 
				+        self.pointer_email = None
			
 
				         self.is_tail = False
			
 
				         self.notes = ''  # 2021/7/20 新增，保存金额大小写，单位等备注
			
 
				         self.money_unit = '' #2021/8/17 新增，保存金额单位 元、万元 、亿元
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -15,7 +15,8 @@ from BiddingKG.dl.interface.predictor import getPredictor
 
				 from BiddingKG.dl.common.nerUtils import *
			
 
				 from BiddingKG.dl.money.moneySource.ruleExtra import extract_moneySource
			
 
				 from BiddingKG.dl.time.re_servicetime import extract_servicetime
			
 
				-from BiddingKG.dl.bidway.re_bidway import extract_bidway
			
 
				+from BiddingKG.dl.relation_extraction.re_email import extract_email
			
 
				+from BiddingKG.dl.bidway.re_bidway import extract_bidway,bidway_integrate
			
 
				 from BiddingKG.dl.fingerprint.documentFingerprint import getFingerprint
			
 
				 from BiddingKG.dl.entityLink.entityLink import *
			
 
				 
			
@@ -437,7 +438,7 @@ def tableToText(soup):
 
				 
			
 
				         repairTable(inner_table)
			
 
				         head_list = sliceTable(inner_table)
			
 
				-
			
 
				+        # print("inner_table:",inner_table)
			
 
				 
			
 
				         return inner_table,head_list
			
 
				                     
			
@@ -1078,7 +1079,7 @@ def segment(soup,final=True):
 
				     #替换为中文分号
			
 
				     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])","；",text)
			
 
				     #替换"？"为 " " ,update:2021/7/20
			
 
				-    text = re.sub("？+"," ",text)
			
 
				+    text = re.sub("？"," ",text)
			
 
				          
			
 
				 
			
 
				     #替换"""为"“",否则导入deepdive出错
			
@@ -1488,6 +1489,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         list_bidway = extract_bidway(article_processed, _title)
			
 
				         if list_bidway:
			
 
				             bidway = list_bidway[0].get("body")
			
 
				+            # bidway名称统一规范
			
 
				+            bidway = bidway_integrate(bidway)
			
 
				         else:
			
 
				             bidway = ""
			
 
				 
			
@@ -1920,6 +1923,48 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                 else:
			
 
				                     index += 1
			
 
				 
			
 
				+            # "联系人"正则补充提取  2021/11/15 新增
			
 
				+            list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
			
 
				+            error_text = ['传真','网址','电子邮','联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
			
 
				+            list_person_text = set(list_person_text + error_text)
			
 
				+            re_person = re.compile("联系人[:：]([\u4e00-\u9fa5]{2,3})(?=联系)|"
			
 
				+                                   "联系人[:：]([\u4e00-\u9fa5]工)|"
			
 
				+                                   "联系人[:：]([\u4e00-\u9fa5]{2,3})")
			
 
				+            list_person = []
			
 
				+            for match_result in re_person.finditer(sentence_text):
			
 
				+                match_text = match_result.group()
			
 
				+                entity_text = match_text[4:]
			
 
				+                wordOffset_begin = match_result.start() + 4
			
 
				+                wordOffset_end = match_result.end()
			
 
				+                # print(text[wordOffset_begin:wordOffset_end])
			
 
				+                if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
			
 
				+                    _person = dict()
			
 
				+                    _person['body'] = entity_text
			
 
				+                    _person['begin_index'] = wordOffset_begin
			
 
				+                    _person['end_index'] = wordOffset_end
			
 
				+                    list_person.append(_person)
			
 
				+            entity_type = "person"
			
 
				+            for person in list_person:
			
 
				+                begin_index_temp = person['begin_index']
			
 
				+                for j in range(len(list_tokenbegin)):
			
 
				+                    if list_tokenbegin[j] == begin_index_temp:
			
 
				+                        begin_index = j
			
 
				+                        break
			
 
				+                    elif list_tokenbegin[j] > begin_index_temp:
			
 
				+                        begin_index = j - 1
			
 
				+                        break
			
 
				+                index = person['end_index']
			
 
				+                end_index_temp = index
			
 
				+                for j in range(begin_index, len(list_tokenbegin)):
			
 
				+                    if list_tokenbegin[j] >= index:
			
 
				+                        end_index = j - 1
			
 
				+                        break
			
 
				+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
			
 
				+                entity_text = person['body']
			
 
				+                list_sentence_entitys.append(
			
 
				+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
			
 
				+                           begin_index_temp, end_index_temp))
			
 
				+
			
 
				             # 资金来源提取  2020/12/30 新增
			
 
				             list_moneySource = extract_moneySource(sentence_text)
			
 
				             entity_type = "moneysource"
			
@@ -1944,6 +1989,30 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
			
 
				                            begin_index_temp, end_index_temp))
			
 
				 
			
 
				+            # 电子邮箱提取 2021/11/04 新增
			
 
				+            list_email = extract_email(sentence_text)
			
 
				+            entity_type = "email" # 电子邮箱
			
 
				+            for email in list_email:
			
 
				+                begin_index_temp = email['begin_index']
			
 
				+                for j in range(len(list_tokenbegin)):
			
 
				+                    if list_tokenbegin[j] == begin_index_temp:
			
 
				+                        begin_index = j
			
 
				+                        break
			
 
				+                    elif list_tokenbegin[j] > begin_index_temp:
			
 
				+                        begin_index = j - 1
			
 
				+                        break
			
 
				+                index = email['end_index']
			
 
				+                end_index_temp = index
			
 
				+                for j in range(begin_index, len(list_tokenbegin)):
			
 
				+                    if list_tokenbegin[j] >= index:
			
 
				+                        end_index = j - 1
			
 
				+                        break
			
 
				+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
			
 
				+                entity_text = email['body']
			
 
				+                list_sentence_entitys.append(
			
 
				+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
			
 
				+                           begin_index_temp, end_index_temp))
			
 
				+
			
 
				             # 服务期限提取 2020/12/30 新增
			
 
				             list_servicetime = extract_servicetime(sentence_text)
			
 
				             entity_type = "serviceTime"
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -1000,23 +1000,34 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				             agency_contact.add(_person.entity_text)
			
 
				     # 正则匹配无 '主体/联系人' 的电话
			
 
				     # 例："采购人联系方式：0833-5226788，"
			
 
				+    phone_pattern = '(1[3|4|5|6|7|8|9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|' \
			
 
				+                    '\+86.?1[3|4|5|6|7|8|9]\d{9}|' \
			
 
				+                    '0[1-9]\d{1,2}[-—－―][1-9]\d{6,7}/[1-9]\d{6,10}|' \
			
 
				+                    '0[1-9]\d{1,2}[-—－―]\d{7,8}.?转\d{1,4}|' \
			
 
				+                    '0[1-9]\d{1,2}[-—－―]\d{7,8}[-—－―]\d{1,4}|' \
			
 
				+                    '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|' \
			
 
				+                   '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?)|' \
			
 
				+                   '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' \
			
 
				+                   '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?|' \
			
 
				+                   '[\（|\(]0[1-9]\d{1,2}[\）|\)]-?\d{7,8}-?\d{,4}|' \
			
 
				+                   '[2-9]\d{6,7})'
			
 
				     re_tenderee_phone = re.compile(
			
 
				         "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人)[:：]?[^。]{0,7}?)"
			
 
				         # 电话号码
			
 
				-        "(1[3-9][0-9][-—－]?\d{4}[-—－]?\d{4}|0\d{2,3}[-—－][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—－][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—－]?[1-9]\d{6,7}|[\（|\(]0\d{2,3}[\）|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
			
 
				+        + phone_pattern)
			
 
				     # 例："采购人地址和联系方式：峨边彝族自治县教育局，0833-5226788，"
			
 
				     re_tenderee_phone2 = re.compile(
			
 
				         "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[:：]?[^。]{0,20}?)"
			
 
				         # 电话号码
			
 
				-        "(1[3-9][0-9][-—－]?\d{4}[-—－]?\d{4}|0\d{2,3}[-—－][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—－][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—－]?[1-9]\d{6,7}|[\（|\(]0\d{2,3}[\）|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
			
 
				+        + phone_pattern)
			
 
				     re_agent_phone = re.compile(
			
 
				         "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人)[:：]?[^。]{0,7}?)"
			
 
				         # 电话号码
			
 
				-        "(1[3-9][0-9][-—－]?\d{4}[-—－]?\d{4}|0\d{2,3}[-—－][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—－][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—－]?[1-9]\d{6,7}|[\（|\(]0\d{2,3}[\）|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
			
 
				+        + phone_pattern)
			
 
				     re_agent_phone2 = re.compile(
			
 
				         "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[:：]?[^。]{0,20}?)"
			
 
				         # 电话号码
			
 
				-        "(1[3-9][0-9][-—－]?\d{4}[-—－]?\d{4}|0\d{2,3}[-—－][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—－][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—－]?[1-9]\d{6,7}|[\（|\(]0\d{2,3}[\）|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
			
 
				+        + phone_pattern)
			
 
				     content = ""
			
 
				     for _sentence in list_sentence:
			
 
				         content += "".join(_sentence.tokens)
			
@@ -1036,24 +1047,32 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				             _tenderee_phone = re.findall(re_tenderee_phone, content)
			
 
				             if _tenderee_phone:
			
 
				                 for _phone in _tenderee_phone:
			
 
				-                    PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
			
 
				-                    tenderee_phone.add(_phone)
			
 
				+                    _phone = _phone.split("/") # 分割多个号码
			
 
				+                    for one_phone in _phone:
			
 
				+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
			
 
				+                        tenderee_phone.add(one_phone)
			
 
				             _tenderee_phone2 = re.findall(re_tenderee_phone2, content)
			
 
				             if _tenderee_phone2:
			
 
				                 for _phone in _tenderee_phone2:
			
 
				-                    PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
			
 
				-                    tenderee_phone.add(_phone)
			
 
				+                    _phone = _phone.split("/")
			
 
				+                    for one_phone in _phone:
			
 
				+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
			
 
				+                        tenderee_phone.add(one_phone)
			
 
				         if PackDict["Project"]["roleList"][i].role_name == "agency":
			
 
				             _agent_phone = re.findall(re_agent_phone, content)
			
 
				             if _agent_phone:
			
 
				                 for _phone in _agent_phone:
			
 
				-                    PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
			
 
				-                    agency_phone.add(_phone)
			
 
				+                    _phone = _phone.split("/")
			
 
				+                    for one_phone in _phone:
			
 
				+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
			
 
				+                        agency_phone.add(one_phone)
			
 
				             _agent_phone2 = re.findall(re_agent_phone2, content)
			
 
				             if _agent_phone2:
			
 
				                 for _phone in _agent_phone2:
			
 
				-                    PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
			
 
				-                    agency_phone.add(_phone)
			
 
				+                    _phone = _phone.split("/")
			
 
				+                    for one_phone in _phone:
			
 
				+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
			
 
				+                        agency_phone.add(one_phone)
			
 
				     # km配对方法
			
 
				     def dispatch(match_list):
			
 
				         main_roles = list(set([match.main_role for match in match_list]))
			
@@ -1077,13 +1096,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				     key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
			
 
				     phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|'
			
 
				                        '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
			
 
				-                       '0[^0]\d{1,2}[-—－―][1-9]\d{6,7}/[1-9]\d{6,10}|'
			
 
				-                       '0[^0]\d{1,2}[-—－―]\d{7,8}转\d{1,4}|'
			
 
				-                       '0[^0]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
			
 
				-                       '0[^0]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=0[^0]\d{1,2}[-—－―]?[1-9]\d{6}\d?)|'
			
 
				-                       '0[^0]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
			
 
				-                       '0[^0]\d{1,2}[-—－―]?[1-9]\d{6}\d?|'
			
 
				-                       '[\（|\(]0[^0]\d{1,2}[\）|\)]-?\d{7,8}-?\d{,4}|'
			
 
				+                       # '0[^0]\d{1,2}[-—－―][1-9]\d{6,7}/[1-9]\d{6,10}|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]\d{7,8}.?转\d{1,4}|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]\d{7,8}[-—－―]\d{1,4}|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?)|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?|'
			
 
				+                       '[\（|\(]0[1-9]\d{1,2}[\）|\)]-?\d{7,8}-?\d{,4}|'
			
 
				                        '[2-9]\d{6,7}')
			
 
				     phone_entitys = []
			
 
				     for _sentence in list_sentence:
			
@@ -1100,17 +1120,30 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				             res_set.add((i.group(), i.start(), i.end()))
			
 
				         # for i in re.finditer(key_word, sentence_text):
			
 
				         #     res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
			
 
				-        for item in list(res_set):
			
 
				+        res_set = sorted(list(res_set),key=lambda x:x[1])
			
 
				+        last_phone_mask = True
			
 
				+        for item_idx in range(len(res_set)):
			
 
				+            item = res_set[item_idx]
			
 
				             phone_left = sentence_text[max(0, item[1] - 10):item[1]]
			
 
				             phone_right = sentence_text[item[2]:item[2] + 8]
			
 
				             # 排除“传真号”和其它错误项
			
 
				-            if re.search("传，?真|信，?箱|邮，?箱", phone_left):
			
 
				+            if re.search("传，?真|信，?箱|邮，?[箱件]", phone_left):
			
 
				                 if not re.search("电，?话", phone_left):
			
 
				+                    last_phone_mask = False
			
 
				                     continue
			
 
				             if re.search("帐，?号|编，?号|报，?价|证，?号|价，?格|[\(（]万?元[\)）]", phone_left):
			
 
				+                last_phone_mask = False
			
 
				                 continue
			
 
				-            if re.search("[.,]\d{2,}", phone_right):
			
 
				+            if re.search("^\d{0,4}[.,]\d{2,}", phone_right):
			
 
				+                last_phone_mask = False
			
 
				                 continue
			
 
				+            # if:上一个phone实体不符合条件
			
 
				+            if not last_phone_mask:
			
 
				+                item_start = item[1]
			
 
				+                last_item_end = res_set[item_idx-1][2]
			
 
				+                if item_start - last_item_end<=1 or re.search("^\d+$",sentence_text[last_item_end:item_start]):
			
 
				+                    last_phone_mask = False
			
 
				+                    continue
			
 
				             for j in range(len(list_tokenbegin)):
			
 
				                 if list_tokenbegin[j] == item[1]:
			
 
				                     begin_index = j
			
@@ -1125,6 +1158,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				             _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
			
 
				                              item[2])
			
 
				             phone_entitys.append(_entity)
			
 
				+            last_phone_mask = True
			
 
				+
			
 
				     def is_company(entity,text):
			
 
				         # 判断"公司"实体是否为地址地点
			
 
				         if entity.label!=5 and entity.values[entity.label]>0.5:
			
@@ -1134,7 +1169,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				         entity_left = text[max(0,entity.wordOffset_begin-10):entity.wordOffset_begin]
			
 
				         entity_left = re.sub("，（）\(\):：","",entity_left)
			
 
				         entity_left = entity_left[-5:]
			
 
				-        if re.search("地址|地点",entity_left):
			
 
				+        if re.search("地址|地点|银行[：:]",entity_left):
			
 
				             return False
			
 
				         else:
			
 
				             return True
			
@@ -1156,7 +1191,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				             _pre_data = pre_data[start:start+maxlen]
			
 
				             _text_data = text_data[start:start+maxlen]
			
 
				             relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
			
 
				-            start = start + maxlen - 100
			
 
				+            start = start + maxlen - 120
			
 
				         # 去重结果
			
 
				         relation_list = list(set(relation_list))
			
 
				     # print(relation_list)
			
@@ -1364,12 +1399,17 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				         last_words_num = len(sentence.sentence_text)
			
 
				 
			
 
				     # 公司-联系人连接（km算法）
			
 
				-    re_phone = re.compile('1[3-9][0-9][-—－]?\d{4}[-—－]?\d{4}|'
			
 
				-                          '0\d{2,3}[-—－][1-9]\d{6,7}/[1-9]\d{6,10}|'
			
 
				-                          '0\d{2,3}[-—－][1-9]\d{6,7}转\d{1,4}|'
			
 
				-                          '0\d{2,3}[-—－]?[1-9]\d{6,7}|'
			
 
				-                          '[\（|\(]0\d{2,3}[\）|\)]-?\d{7,8}-?\d{,4}|'
			
 
				-                          '[1-9]\d{6,7}')
			
 
				+    re_phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|'
			
 
				+                       '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―][1-9]\d{6,7}/[1-9]\d{6,10}|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]\d{7,8}.?转\d{1,4}|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]\d{7,8}[-—－―]\d{1,4}|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?)|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
			
 
				+                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?|'
			
 
				+                       '[\（|\(]0[1-9]\d{1,2}[\）|\)]-?\d{7,8}-?\d{,4}|'
			
 
				+                       '[2-9]\d{6,7}')
			
 
				     key_phone = re.compile("联系方式|电话|联系人|负责人")
			
 
				     temporary_list2 = []
			
 
				     for entity in list_entity:
			
@@ -1553,8 +1593,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				     # km算法分配求解
			
 
				     result2 = dispatch(match_list2)
			
 
				     # print(result2)
			
 
				-    linked_person = []
			
 
				-    linked_persons_with = []
			
 
				     for match in result2:
			
 
				         entity = match[0]
			
 
				         # print(entity.entity_text)
			
@@ -1563,7 +1601,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				         is_update = False
			
 
				         if isinstance(match[1], tuple):
			
 
				             person_ = ''
			
 
				-            phone_ = [match[1][1]]
			
 
				+            phone_ = match[1][1].split("/") # 分割多个号码
			
 
				+            # print(person_,phone_)
			
 
				         else:
			
 
				             person_ = match[1].entity_text
			
 
				             phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
			
@@ -1576,6 +1615,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				                                 if not phone_:
			
 
				                                     PackDict[k]["roleList"][i].linklist.append((person_, ""))
			
 
				                                 for p in phone_:
			
 
				+                                    # if not person_ and len()
			
 
				                                     PackDict[k]["roleList"][i].linklist.append((person_, p))
			
 
				                                 is_update = True
			
 
				                 elif PackDict[k]["roleList"][i].role_name == "agency":
			
@@ -1604,9 +1644,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				             if not list_entity[entity_index].pointer_person:
			
 
				                 list_entity[entity_index].pointer_person = []
			
 
				             list_entity[entity_index].pointer_person.append(match[1])
			
 
				-            linked_person.append(match[1])
			
 
				-            linked_persons_with.append(entity)
			
 
				 
			
 
				+    linked_person = []
			
 
				+    linked_persons_with = []
			
 
				+    for company_entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]:
			
 
				+        if company_entity.pointer_person:
			
 
				+            for _person in company_entity.pointer_person:
			
 
				+                linked_person.append(_person)
			
 
				+                linked_persons_with.append(company_entity)
			
 
				 
			
 
				     # 一个公司对应多个联系人的补充
			
 
				     person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
			
--- a/BiddingKG/dl/interface/modelFactory.py
+++ b/BiddingKG/dl/interface/modelFactory.py
@@ -261,6 +261,7 @@ class Model_relation_extraction():
 
				     def predict(self,text_in, words, rate=0.5):
			
 
				         # text_words = text_in
			
 
				         triple_list = []
			
 
				+        # print("tokens:",words)
			
 
				         # _t2 = [self.words2id.get(c, 1) for c in words]
			
 
				         _t2 = np.zeros((len(words), self.words_size))
			
 
				         for i in range(len(words)):
			
@@ -293,6 +294,7 @@ class Model_relation_extraction():
 
				                     _object = text_in[_ooo1]
			
 
				                     _predicate = self.id2predicate[_c1]
			
 
				                     triple_list.append((_subject[0], _predicate, _object))
			
 
				+            # print([(t[0].entity_text,t[1],t[2].entity_text) for t in triple_list])
			
 
				             return triple_list
			
 
				         else:
			
 
				             return []
			
--- a/BiddingKG/dl/money/moneySource/ruleExtra.py
+++ b/BiddingKG/dl/money/moneySource/ruleExtra.py
@@ -3,7 +3,7 @@ import os
 
				 sys.path.append(os.path.abspath("../.."))
			
 
				 import pandas as pd
			
 
				 import re
			
 
				-from BiddingKG.dl.interface import Entitys
			
 
				+# from BiddingKG.dl.interface import Entitys
			
 
				 
			
 
				 def re_rule():
			
 
				 
			
--- a/BiddingKG/dl/relation_extraction/re_email.py
+++ b/BiddingKG/dl/relation_extraction/re_email.py
@@ -0,0 +1,30 @@
 
				+import sys
			
 
				+import os
			
 
				+sys.path.append(os.path.abspath("../.."))
			
 
				+import re
			
 
				+# from BiddingKG.dl.interface import Entitys
			
 
				+
			
 
				+
			
 
				+def extract_email(text):
			
 
				+    re_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
			
 
				+                            "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
			
 
				+
			
 
				+    list_email = []
			
 
				+    for match_result in re_pattern.finditer(text):
			
 
				+        entity_text = match_result.group()
			
 
				+        wordOffset_begin = match_result.start()
			
 
				+        wordOffset_end = match_result.end()
			
 
				+        # print(text[wordOffset_begin:wordOffset_end])
			
 
				+        _email = dict()
			
 
				+        _email['body'] = entity_text
			
 
				+        _email['begin_index'] = wordOffset_begin
			
 
				+        _email['end_index'] = wordOffset_end
			
 
				+        list_email.append(_email)
			
 
				+    return list_email
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    text ="联系人： 李春宜 联系电话：电话：0755-89663666-2492 邮箱：chun_yi.li@ci16-3mc.com.cn 邮箱：邮箱：chun_yi.li@qq.com"
			
 
				+    # extract_email(text)
			
 
				+    # print(extract_email(text))
			
 
				+    pass
			
--- a/BiddingKG/dl/test/test4.py
+++ b/BiddingKG/dl/test/test4.py
@@ -38,7 +38,8 @@ def test(name,content):
 
				 if __name__=="__main__":
			
 
				     # filename = "比地_52_79929693.html"
			
 
				     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
			
 
				-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
			
 
				+    # text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
			
 
				+    text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
			
 
				     content = str(BeautifulSoup(text).find("div",id="pcontent"))
			
 
				     # df_a = {"html":[]}
			
 
				     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
			
@@ -65,8 +66,9 @@ if __name__=="__main__":
 
				     # content = '''
			
 
				     # 广州比地数据科技有限公司翻译服务工程招标
			
 
				     # '''
			
 
				-    print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
			
 
				+    # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
			
 
				     # print(predict("12", text))
			
 
				-    # test("12",content)
			
 
				+    # test("12",text)
			
 
				+    test("12",content)
			
 
				     print("takes",time.time()-_time1)
			
 
				     pass
			
--- a/BiddingKG/dl/test/测试整个要素提取流程.py
+++ b/BiddingKG/dl/test/测试整个要素提取流程.py
@@ -69,7 +69,7 @@ class MyEncoder(json.JSONEncoder):
 
				 
			
 
				 
			
 
				 def predict(doc_id,text):
			
 
				-    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
			
 
				+    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","","",""]],useselffool=True)
			
 
				     for articles in list_articles:
			
 
				         print('预处理后文本信息')
			
 
				         print(articles.content)
			
@@ -123,10 +123,15 @@ def predict(doc_id,text):
 
				             if entity.entity_type=='person':
			
 
				                 print("联系方式：",end=' ')
			
 
				                 print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else None,entity.label,entity.values)
			
 
				+                # print(entity.begin_index, entity.end_index)
			
 
				                 print(entity.sentence_index)
			
 
				+                pass
			
 
				             elif entity.entity_type=="time":
			
 
				                 print("time:",end=" ")
			
 
				                 print(entity.entity_text, entity.label, entity.values)
			
 
				+            elif entity.entity_type=="email":
			
 
				+                print("email:",end=" ")
			
 
				+                print(entity.entity_text, entity.begin_index, entity.end_index)
			
 
				             elif entity.entity_type in ['org','company']:
			
 
				                 _sentence = list_sentences[0][entity.sentence_index]
			
 
				                 if entity.pointer_person: