Kaynağa Gözat

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	BiddingKG/dl/interface/Preprocessing.py
znj 3 yıl önce
ebeveyn
işleme
36c7e36d20

+ 1 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -38,7 +38,7 @@ def jaccard_score(source,target):
 
 
 def get_place_list():
-    path = os.path.abspath(__file__) + '/../../place_info.csv'
+    path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../")) + '/place_info.csv'
     place_df = pd.read_csv(path)
 
     place_list = []

+ 8 - 6
BiddingKG/dl/foolnltk/selffool/selffool_ner.py

@@ -143,9 +143,14 @@ class SelfNer():
             chars = list(text)
 
             for label, word in zip(ner_label, chars):
-                i += 1
+                # i += 1
 
                 if label == "O":
+                    if entity:
+                        # print('不完整实体:', lb, entity)
+                        ens.append((i - len(entity), i, lt, entity))
+                        entity = ""
+                    i += 1
                     continue
 
                 lt = label.split("_")[1]
@@ -161,13 +166,10 @@ class SelfNer():
 
                 elif lb == "E":
                     entity += word
-                    ens.append((i - len(entity), i + 1, lt, entity))
+                    ens.append((i - len(entity)+1, i + 1, lt, entity))
                     entity = ""
-
-            if entity:
-                ens.append((i - len(entity), i + 1, lt, entity))
+                i += 1
             all_entitys.append(ens)
-
         return all_entitys
     
     

+ 10 - 5
BiddingKG/dl/interface/Preprocessing.py

@@ -1687,8 +1687,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     if k == 'text':
                         entity = v
                 b = it.start() + len(keyword)
-                e = it.end()-1
-                if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
+                e = it.end() - 1
+                if (b, e, 'location', entity) in ner_entitys:
+                    ner_entitys.remove((b, e, 'location', entity))
+                    ner_entitys.append((b, e, 'company', entity))
+                elif (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
                     ner_entitys.append((b, e, 'company', entity))
 
             for it in re.finditer(
@@ -1701,6 +1704,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         entity = v
                 b = it.start() + len(keyword)
                 e = it.end() - 1
+                if (b, e, 'location', entity) in ner_entitys:
+                    ner_entitys.remove((b, e, 'location', entity))
+                    ner_entitys.append((b, e, 'org', entity))
                 if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
                     ner_entitys.append((b, e, 'org', entity))
 
@@ -1733,7 +1739,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 #去掉标点符号
                 entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
-                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1]-1))
+                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1]))
             # 标记文章末尾的"发布人”、“发布时间”实体
             if sentence_index==len(list_sentence)-1:
                 if len(list_sentence_entitys[-2:])>2:
@@ -1952,8 +1958,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         else:
                             entity_text = str(getUnifyMoney(entity_text))
 
-                    # if float(entity_text)<100 or float(entity_text)>100000000000:
-                    if float(entity_text)<50 or float(entity_text)>100000000000:
+                    if float(entity_text)<100 or float(entity_text)>100000000000:
                         # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
                         continue