Explorar o código

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	BiddingKG/dl/entityLink/entityLink.py
luojiehua hai 3 anos
pai
achega
d66544fc15

+ 0 - 2
BiddingKG/dl/entityLink/entityLink.py

@@ -370,8 +370,6 @@ def fix_LEGAL_ENTERPRISE():
                 if not line:
                     break
                 line = line.strip()
-                if line=="工会委员会":
-                    print(line,isLegalEnterprise(line))
                 if isLegalEnterprise(line):
                     set_enter.add(line)
 

+ 8 - 6
BiddingKG/dl/foolnltk/selffool/selffool_ner.py

@@ -143,9 +143,14 @@ class SelfNer():
             chars = list(text)
 
             for label, word in zip(ner_label, chars):
-                i += 1
+                # i += 1
 
                 if label == "O":
+                    if entity:
+                        # print('不完整实体:', lb, entity)
+                        ens.append((i - len(entity), i, lt, entity))
+                        entity = ""
+                    i += 1
                     continue
 
                 lt = label.split("_")[1]
@@ -161,13 +166,10 @@ class SelfNer():
 
                 elif lb == "E":
                     entity += word
-                    ens.append((i - len(entity), i + 1, lt, entity))
+                    ens.append((i - len(entity)+1, i + 1, lt, entity))
                     entity = ""
-
-            if entity:
-                ens.append((i - len(entity), i + 1, lt, entity))
+                i += 1
             all_entitys.append(ens)
-
         return all_entitys
     
     

+ 8 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -1688,7 +1688,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         entity = v
                 b = it.start() + len(keyword)
                 e = it.end() - 1
-                if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
+                if (b, e, 'location', entity) in ner_entitys:
+                    ner_entitys.remove((b, e, 'location', entity))
+                    ner_entitys.append((b, e, 'company', entity))
+                elif (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
                     ner_entitys.append((b, e, 'company', entity))
 
             for it in re.finditer(
@@ -1701,6 +1704,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         entity = v
                 b = it.start() + len(keyword)
                 e = it.end() - 1
+                if (b, e, 'location', entity) in ner_entitys:
+                    ner_entitys.remove((b, e, 'location', entity))
+                    ner_entitys.append((b, e, 'org', entity))
                 if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
                     ner_entitys.append((b, e, 'org', entity))
 
@@ -1733,7 +1739,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 #去掉标点符号
                 entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
-                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1]-1))
+                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1]))
             # 标记文章末尾的"发布人”、“发布时间”实体
             if sentence_index==len(list_sentence)-1:
                 if len(list_sentence_entitys[-2:])>2: