Browse Source

电话提取正则规则优化

znj 2 years ago
parent
commit
dfa1392727
2 changed files with 42 additions and 6 deletions
  1. + 1 - 1
      BiddingKG/dl/common/Utils.py
  2. + 41 - 5
      BiddingKG/dl/interface/getAttributes.py

+ 1 - 1
BiddingKG/dl/common/Utils.py

@@ -878,7 +878,7 @@ def precision(y_true, y_pred):
 
 if __name__=="__main__":
     # print(fool_char_to_id[">"])
-    print(getUnifyMoney('壹拾柒万元'))
+    print(getUnifyMoney('伍仟贰佰壹拾伍万零捌佰壹拾伍元陆角伍分'))
     # model = getModel_w2v()
     # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128)
     # save([vocab,matrix],"vocabMatrix_words.pk")

+ 41 - 5
BiddingKG/dl/interface/getAttributes.py

@@ -1283,30 +1283,52 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             res_set.add((i.group(), i.start(), i.end()))
         res_set = sorted(list(res_set),key=lambda x:x[1])
         last_phone_mask = True
+        error_numStr_index = []
+        sentence_phone_list = []
         for item_idx in range(len(res_set)):
             item = res_set[item_idx]
             phone_left = sentence_text[max(0, item[1] - 10):item[1]]
-            phone_right = sentence_text[item[2]:item[2] + 8]
+            phone_right = sentence_text[item[2]:item[2] + 10]
+            phone_left_num = re.search("[\da-zA-Z\-—-―]+$",phone_left)
+            numStr_left = item[1]
+            if phone_left_num:
+                numStr_left -= len(phone_left_num.group())
+            phone_right_num = re.search("^[\da-zA-Z\-—-―]+",phone_right)
+            numStr_right = item[2]
+            if phone_right_num:
+                numStr_right += len(phone_right_num.group())
+            numStr_index = (numStr_left,numStr_right)
+
             if re.search("电话|手机|联系[人方]|联系方式",re.sub(",","",phone_left)):
                 pass
             else:
                 # 排除“传真号”和其它错误项
                 if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
                     if not re.search("电,?话", phone_left):
+                        error_numStr_index.append(numStr_index)
                         last_phone_mask = False
                         continue
-                if re.search("注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
+                if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
+                    error_numStr_index.append(numStr_index)
                     last_phone_mask = False
                     continue
                 if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
+                    error_numStr_index.append(numStr_index)
                     last_phone_mask = False
                     continue
                 # 号码含有0过多,不符合规则
-                if re.search("0{5,}",item[0]):
+                if re.search("0{6,}",item[0]):
+                    error_numStr_index.append(numStr_index)
                     last_phone_mask = False
                     continue
                 # 前后跟着字母
                 if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right):
+                    error_numStr_index.append(numStr_index)
+                    last_phone_mask = False
+                    continue
+                # 时间日期类排除
+                if re.search("时间|日期", phone_left):
+                    error_numStr_index.append(numStr_index)
                     last_phone_mask = False
                     continue
                 # 前后跟着长度小于一定值数字的正则排除
@@ -1315,10 +1337,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right)
                     if phone_left_number:
                         if len(phone_left_number.group())<7:
+                            error_numStr_index.append(numStr_index)
                             last_phone_mask = False
                             continue
                     if phone_right_number:
                         if len(phone_right_number.group())<7:
+                            error_numStr_index.append(numStr_index)
                             last_phone_mask = False
                             continue
                 # if:上一个phone实体不符合条件
@@ -1326,8 +1350,21 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     item_start = item[1]
                     last_item_end = res_set[item_idx-1][2]
                     if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―、]+$",sentence_text[last_item_end:item_start]):
+                        error_numStr_index.append(numStr_index)
                         last_phone_mask = False
                         continue
+            sentence_phone_list.append(item)
+            last_phone_mask = True
+        if error_numStr_index:
+            drop_list = []
+            for item in sentence_phone_list:
+                for err_index in error_numStr_index:
+                    if (item[1]>=err_index[0] and item[1]<=err_index[1]) or (item[2]>=err_index[0] and item[2]<=err_index[1]) or (item[1]<=err_index[0] and item[2]>=err_index[1]):
+                        drop_list.append(item)
+                        break
+            for _drop_item in drop_list:
+                sentence_phone_list.remove(_drop_item)
+        for item in sentence_phone_list:
             for j in range(len(list_tokenbegin)):
                 if list_tokenbegin[j] == item[1]:
                     begin_index = j
@@ -1342,8 +1379,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
                              item[2],in_attachment=in_attachment)
             phone_entitys.append(_entity)
-            last_phone_mask = True
-
+    print('phone_set:',set([ent.entity_text for ent in phone_entitys]))
     def is_company(entity,text):
         # 判断"公司"实体是否为地址地点
         if entity.label!=5 and entity.values[entity.label]>0.5: