Browse Source

Merge branch 'master' of http://192.168.2.65:3000/BIDI-ML/BIDI_ML_INFO_EXTRACTION

rogel 4 years ago
parent
commit
e9795cf24e
2 changed files with 25 additions and 24 deletions
  1. BiddingKG/dl/interface/predictor.py (+23 -22)
  2. BiddingKG/dl/test/test4.py (+2 -2)

+ 23 - 22
BiddingKG/dl/interface/predictor.py

@@ -248,7 +248,7 @@ class CodeNamePredict():
                 _LEN = MAX_AREA//MAX_LEN
                 #预测
 
-                # x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
+                # x = [[self.word2index.get(word,index_pad)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                 x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                 x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
                 x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
@@ -370,27 +370,28 @@ class CodeNamePredict():
             list_name_freq_score = []
 
             # 2020/11/23 大网站规则调整
-            name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
-            for sentence in list_sentence:
-                # pad_sentence = sentence.sentence_text
-                othername = re.search(name_re1, sentence.sentence_text)
-                if othername != None:
-                    project_name = othername.group(3)
-                    beg = find_index([project_name], sentence.sentence_text)[0]
-                    end = beg + len(project_name)
-                    _name = self.fitDataByRule(sentence.sentence_text[beg:end])
-                    # add name to entitys
-                    _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
-                    sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
-                                     entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
-                                     end_index=0, wordOffset_begin=beg, wordOffset_end=end)
-                    list_entity.append(_entity)
-                    w = 1
-                    if _name not in dict_name_freq_score:
-                        # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
-                        dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
-                    else:
-                        dict_name_freq_score[_name][0] += 1
+            if len(dict_name_freq_score) == 0:
+                name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
+                for sentence in list_sentence:
+                    # pad_sentence = sentence.sentence_text
+                    othername = re.search(name_re1, sentence.sentence_text)
+                    if othername != None:
+                        project_name = othername.group(3)
+                        beg = find_index([project_name], sentence.sentence_text)[0]
+                        end = beg + len(project_name)
+                        _name = self.fitDataByRule(sentence.sentence_text[beg:end])
+                        # add name to entitys
+                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
+                        sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
+                                         entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
+                                         end_index=0, wordOffset_begin=beg, wordOffset_end=end)
+                        list_entity.append(_entity)
+                        w = 1
+                        if _name not in dict_name_freq_score:
+                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
+                        else:
+                            dict_name_freq_score[_name][0] += 1
                 # othername = re.search(name_re1, sentence.sentence_text)
                 # if othername != None:
                 #     _name = othername.group(3)

+ 2 - 2
BiddingKG/dl/test/test4.py

@@ -135,8 +135,8 @@ if __name__=="__main__":
     # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
     text = '''
-    SC2020113000007成交结果,一、项目信息,采购日期:2020-11-3011:39:12,采购单位:机械科学与工程学院,成交供应商:上海晨光科力普办公用品有限公司,支付方式:货到付款,订单编号:SC2020113000007,二、成交结果,商品名称:威联通(QNAP),网络存储服务器,TS-873,八盘位企业级nas,8G内存,64TB,1TSSD,八盘位企业级nas,8G内存,64TB,1TSSD,规格型号:TS-873,数量:1,:X,单价(元):24600.00,:=,小计(元):¥24600.00。
-成交金额::¥24600.00。'''
+    ,光大证券统一认证系统服务器硬件设备更新项目中标候选人公示,项目名称:光大证券统一认证系统服务器硬件设备更新项目,招标编号:CG-202011-030-001,公告日期:2020年12月3日,评标日期:2020年11月30日13时32分,评标地点:光大证券集中采购管理平台,推荐中标候选人:上海致为信息技术有限公司,联系人:殷志超,联系电话:021-22169419
+    '''
     print("start")
     # print(predict("12",content))
     print(predict("投诉处理公告", text))