浏览代码

采购意向错误修正

znj 3 年之前
父节点
当前提交
ef6b0901c3

+ 10 - 7
BiddingKG/dl/entityLink/entityLink.py

@@ -128,17 +128,20 @@ def doctitle_refine(doctitle):
     return _doctitle_refine
 # 前100个公司实体
 def get_nlp_enterprise(list_entity):
-    count = 0
     nlp_enterprise = []
+    nlp_enterprise_attachment = []
+    max_num = 100
     list_entity = sorted(list_entity,key=lambda x:(x.sentence_index,x.begin_index))
     for entity in list_entity:
         if entity.entity_type in ['org','company']:
-            if entity.entity_text not in nlp_enterprise:
-                nlp_enterprise.append(entity.entity_text)
-                count += 1
-                if count>=100:
-                    break
-    return nlp_enterprise
+            if not entity.in_attachment:
+                if entity.entity_text not in nlp_enterprise:
+                    nlp_enterprise.append(entity.entity_text)
+            else:
+                if entity.entity_text not in nlp_enterprise_attachment:
+                    nlp_enterprise_attachment.append(entity.entity_text)
+
+    return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num]
 
 def getEnterprisePath():
     filename = "LEGAL_ENTERPRISE.txt"

+ 2 - 1
BiddingKG/dl/interface/extract.py

@@ -195,7 +195,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     start_time = time.time()  # 实体链接
     entityLink.link_entitys(list_entitys)
     doctitle_refine = entityLink.doctitle_refine(title)
-    nlp_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
+    nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0])
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
@@ -249,6 +249,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
+    data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
     # 要素的个数
     data_res['extract_count'] = extractCount(data_res)
     # 是否有表格

+ 6 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -2836,7 +2836,12 @@ def getOtherAttributes(list_entity):
         elif entity.entity_type=='moneysource':
             dict_other["moneysource"] = entity.entity_text
         elif entity.entity_type=='serviceTime':
-            dict_other["serviceTime"] = entity.entity_text
+            if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[\-\./]\d{1,2}", entity.entity_text):
+                if not entity.in_attachment:
+                    dict_other["serviceTime"] = entity.entity_text
+                else:
+                    if not dict_other["serviceTime"]:
+                        dict_other["serviceTime"] = entity.entity_text
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':

+ 28 - 18
BiddingKG/dl/interface/predictor.py

@@ -2206,7 +2206,7 @@ class ProductAttributesPredictor():
                 elif re.search('预算', items[j]):
                     header_dic['预算'] = j
                     budget = items[j]
-                elif re.search('时间|采购实施月份|采购月份', items[j]):
+                elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
                     header_dic['时间'] = j
                     order_time = items[j]
 
@@ -2250,7 +2250,6 @@ class ProductAttributesPredictor():
             i = 0
             found_header = False
             header_colnum = 0
-
             if flag_yx:
                 col0_l = []
                 col1_l = []
@@ -2271,11 +2270,15 @@ class ProductAttributesPredictor():
                         elif re.search('采购预算|预算金额', col0_l[i]):
                             header_list2.append(col0_l[i])
                             budget = col1_l[i]
-                            if '万元' in col0_l[i] and '万' not in budget:
-                                budget += '万元'
-                            budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
-                            budget = str(getUnifyMoney(budget))
-                        elif re.search('采购时间|采购实施月份|采购月份', col0_l[i]):
+                            re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
+                            if re_price:
+                                budget = re_price[0]
+                                if '万元' in col0_l[i] and '万' not in budget:
+                                    budget += '万元'
+                                budget = str(getUnifyMoney(budget))
+                            else:
+                                budget = ""
+                        elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
                             header_list2.append(col0_l[i])
                             order_time = col1_l[i].strip()
                             order_begin, order_end = self.fix_time(order_time, html, page_time)
@@ -2292,7 +2295,6 @@ class ProductAttributesPredictor():
                             demand_link.append(link)
                             headers_demand.append('_'.join(header_list2))
                         continue
-
             while i < (len(inner_table)):
                 tds = inner_table[i]
                 not_empty = [it for it in tds if it != ""]
@@ -2309,7 +2311,6 @@ class ProductAttributesPredictor():
                 order_time = ""  # 采购时间
                 order_begin = ""
                 order_end = ""
-
                 if len(set(tds) & self.header_set) > len(tds) * 0.2:
                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
                     if found_header:
@@ -2343,10 +2344,15 @@ class ProductAttributesPredictor():
                         if id3 != "":
                             if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                                 unitPrice = tds[id3]
-                                if '万元' in header_list[2] and '万' not in unitPrice:
-                                    unitPrice += '万元'
-                                unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
-                                unitPrice = str(getUnifyMoney(unitPrice))
+                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?",unitPrice)
+                                if re_price:
+                                    unitPrice = re_price[0]
+                                    if '万元' in header_list[2] and '万' not in unitPrice:
+                                        unitPrice += '万元'
+                                    # unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
+                                    unitPrice = str(getUnifyMoney(unitPrice))
+                                else:
+                                    unitPrice = ""
                             else:
                                 unitPrice = ""
                         if id4 != "":
@@ -2367,10 +2373,14 @@ class ProductAttributesPredictor():
                         if id7 != "":
                             if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
                                 budget = tds[id7]
-                                if '万元' in header_list2[2] and '万' not in budget:
-                                    budget += '万元'
-                                budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
-                                budget = str(getUnifyMoney(budget))
+                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
+                                if re_price:
+                                    budget = re_price[0]
+                                    if '万元' in header_list[2] and '万' not in budget:
+                                        budget += '万元'
+                                    budget = str(getUnifyMoney(budget))
+                                else:
+                                    budget = ""
                             else:
                                 budget = ""
                         if id8 != "":
@@ -2416,7 +2426,7 @@ class ProductAttributesPredictor():
             list_sentence = list_sentences[0]
             list_entity = list_entitys[0]
             _data = product_attrs[1]['demand_info']['data']
-            re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份)[::,].{0,2}$")
+            re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
             order_times = []
             for entity in list_entity:
                 if entity.entity_type=='time':

+ 1 - 1
BiddingKG/dl/time/re_servicetime.py

@@ -192,7 +192,7 @@ if __name__ == '__main__':
 
 (6)信用记录:供应商未被列入“信用中国”网站(www.creditchina.gov.cn)“记录失信被执行人或重大税收违法案件当事人名单”记录名单; 不处于中国政府采购网(www.ccgp.gov.cn)“政府采购严重违法失信行为信息记录”中的禁止参加政府采购活动期间。
     """
-    # s = "自合同签订之日起至2022-6-30 自合同签订之日起至2022-07-30"
+    s = "合同履行日期: 五、"
     print(extract_servicetime(s))
 
     # df = pd.read_csv("C:\\Users\\admin\\Desktop\\serviceTime_text.csv")