znj 3 жил өмнө
parent
commit
24bd7d3996

+ 4 - 0
BiddingKG/dl/common/Utils.py

@@ -450,6 +450,10 @@ def getUnifyMoney(money):
                 elif len(subMoneys[0])==1:
                     if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
                         result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
+                # subMoneys[0]中无金额单位,不可再拆分
+                elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
+                    subMoneys[0] = subMoneys[0][0]
+                    result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
                 else:
                     result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
                 if len(subMoneys)>1:

+ 17 - 16
BiddingKG/dl/interface/Preprocessing.py

@@ -2462,27 +2462,28 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
             error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
                           '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理',
-                          '代理人','采购','附件','注意','登录','报名','踏勘']
+                          '代理人','采购','附件','注意','登录','报名','踏勘',"测试"]
             list_person_text = set(list_person_text + error_text)
             re_person = re.compile("联系人[::]([\u4e00-\u9fa5]工)|"
                                    "联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"
                                    "联系人[::]([\u4e00-\u9fa5]{2,3})")
             list_person = []
-            for match_result in re_person.finditer(sentence_text):
-                match_text = match_result.group()
-                entity_text = match_text[4:]
-                wordOffset_begin = match_result.start() + 4
-                wordOffset_end = match_result.end()
-                # print(text[wordOffset_begin:wordOffset_end])
-                # 排除一些不为人名的实体
-                if re.search("^[\u4e00-\u9fa5]{7,}([,。]|$)",sentence_text[wordOffset_begin:wordOffset_begin+20]):
-                    continue
-                if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
-                    _person = dict()
-                    _person['body'] = entity_text
-                    _person['begin_index'] = wordOffset_begin
-                    _person['end_index'] = wordOffset_end
-                    list_person.append(_person)
+            if not in_attachment:
+                for match_result in re_person.finditer(sentence_text):
+                    match_text = match_result.group()
+                    entity_text = match_text[4:]
+                    wordOffset_begin = match_result.start() + 4
+                    wordOffset_end = match_result.end()
+                    # print(text[wordOffset_begin:wordOffset_end])
+                    # 排除一些不为人名的实体
+                    if re.search("^[\u4e00-\u9fa5]{7,}([,。]|$)",sentence_text[wordOffset_begin:wordOffset_begin+20]):
+                        continue
+                    if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
+                        _person = dict()
+                        _person['body'] = entity_text
+                        _person['begin_index'] = wordOffset_begin
+                        _person['end_index'] = wordOffset_end
+                        list_person.append(_person)
             entity_type = "person"
             for person in list_person:
                 begin_index_temp = person['begin_index']

+ 5 - 1
BiddingKG/dl/interface/extract.py

@@ -49,7 +49,7 @@ def extractCount(extract_dict):
         _extract = extract_dict
     else:
         _extract = {}
-    print(_extract)
+    # print(_extract)
     dict_pack = _extract.get("prem",{})
     extract_count = 0
     list_code = _extract.get("code",[])
@@ -203,6 +203,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = round(time.time()-start_time,2)
 
+    '''公告无表格格式时,采购意向预测'''
+    if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
+        product_attrs = predictor.getPredictor("product_attrs").predict_without_table(product_attrs, list_sentences,
+                                                                                      list_entitys,codeName,prem,text,page_time)
     if len(product_attrs[1]['demand_info']['data'])>0:
         for d in product_attrs[1]['demand_info']['data']:
             for product in set(prem[0]['product']):

+ 19 - 12
BiddingKG/dl/interface/getAttributes.py

@@ -348,18 +348,25 @@ def get_legal_comba(list_entity,dict_role_combination):
 
 def get_dict_entity_prob(list_entity,on_value=0.5):
+    """Collect the best role probability per (package, role label, entity text) key.
+
+    Entities of type 'org' or 'company' are scanned in two passes: first those
+    whose ``in_attachment`` flag is False, then those whose flag is True.
+    An entity is recorded when the probability of its own label is at least
+    ``on_value`` and its label is not "5" (presumably the "no role" class --
+    TODO confirm against the role model).
+
+    :param list_entity: entity objects; the attributes read here are
+        ``entity_type``, ``in_attachment``, ``values``, ``label``,
+        ``packageName`` and ``entity_text``.
+    :param on_value: inclusive lower bound on the probability, default 0.5.
+    :return: dict mapping ``packageName + "$$" + label + "$text$" + entity_text``
+        to ``[entity_text, role_prob]``.
+    """
     dict_pack_entity_prob = {}
-    for entity in list_entity:
-        if entity.entity_type in ['org','company']:
-            values = entity.values
-            role_prob = float(values[int(entity.label)])
-            _key = entity.packageName+"$$"+str(entity.label)
-            if role_prob>=on_value and str(entity.label)!="5":
-                _key_prob = _key+"$text$"+entity.entity_text
-                if _key_prob in dict_pack_entity_prob:
-                    if role_prob>dict_pack_entity_prob[_key_prob][1]:
+    # Pass 1 handles entities outside attachments, pass 2 those inside attachments.
+    for in_attachment in [False,True]:
+        identified_role = []
+        # Before the attachment pass, snapshot the entity texts recorded by pass 1.
+        if in_attachment==True:
+            identified_role = [value[0] for value in dict_pack_entity_prob.values()]
+        for entity in list_entity:
+            if entity.entity_type in ['org','company'] and entity.in_attachment==in_attachment:
+                values = entity.values
+                # values is indexed by the integer label, giving that label's probability.
+                role_prob = float(values[int(entity.label)])
+                _key = entity.packageName+"$$"+str(entity.label)
+                if role_prob>=on_value and str(entity.label)!="5":
+                    _key_prob = _key+"$text$"+entity.entity_text
+                    # An attachment entity is skipped when its text was already recorded in pass 1.
+                    if in_attachment == True:
+                        if entity.entity_text in identified_role:
+                            continue
+                    # Keep only the highest probability seen for each key.
+                    if _key_prob in dict_pack_entity_prob:
+                        if role_prob>dict_pack_entity_prob[_key_prob][1]:
+                            dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
+                    else:
                         dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
-                else:
-                    dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
     return dict_pack_entity_prob
 
 
@@ -828,7 +835,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     packDict[packageName]["roleList"][i].money = money.entity_text
                     packDict[packageName]["roleList"][i].money_prob = money_prob
                     packDict[packageName]["roleList"][i].money_unit = money.money_unit
-                elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or money.notes in ['大写']: # 2021/7/20改为优先选择大写金额,
+                elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or (money.notes in ['大写'] and money.in_attachment==False): # 2021/7/20改为优先选择大写金额,
                     # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob)
                     # print('链接金额备注 ',money.notes, money.entity_text, money.values)
                     packDict[packageName]["roleList"][i].money = money.entity_text

+ 54 - 6
BiddingKG/dl/interface/predictor.py

@@ -1111,13 +1111,13 @@ class RoleRulePredictor():
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托))"
         self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^现委托|^的\w{2,10}正在进行)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
-        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
+        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
         self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
                                         "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
                                         "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
-        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
+        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)(名称)?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
         # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
@@ -1126,10 +1126,10 @@ class RoleRulePredictor():
 
         # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
 
-        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
+        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
         self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
         
-        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
+        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
 
         self.pattern_whole = [self.pattern_tenderee_left,
@@ -1508,7 +1508,7 @@ class TendereeRuleRecall():
                                 "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
                                 "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
         # 未识别实体尾部判断
-        self.unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓)")
+        self.unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心)")
         self.unrecognized_end2 = re.compile(".{4,}(?:署|局|厅|处|室|科|部|站|所|股|行)")
 
     def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
@@ -2254,6 +2254,12 @@ class ProductAttributesPredictor():
                             header_list2.append(col0_l[i])
                             order_time = col1_l[i].strip()
                             order_begin, order_end = self.fix_time(order_time, html, page_time)
+                    if order_begin != "" and order_end!="":
+                        order_begin_year = int(order_begin.split("-")[0])
+                        order_end_year = int(order_end.split("-")[0])
+                        # 限制附件错误识别时间
+                        if order_begin_year>=2050 or order_end_year>=2050:
+                            order_begin = order_end = ""
                     if product!= "" and demand != "" and budget!="" and order_begin != "":
                         link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                 'order_begin': order_begin, 'order_end': order_end}
@@ -2357,7 +2363,13 @@ class ProductAttributesPredictor():
                                         total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', ''))
                                     except:
                                         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
-                        if budget != "" and order_time != "" :
+                        if order_begin != "" and order_end != "":
+                            order_begin_year = int(order_begin.split("-")[0])
+                            order_end_year = int(order_end.split("-")[0])
+                            # 限制附件错误识别时间
+                            if order_begin_year >= 2050 or order_end_year >= 2050:
+                                order_begin = order_end = ""
+                        if budget != "" and order_time != "":
                             link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
                             if link not in demand_link:
                                 demand_link.append(link)
@@ -2374,6 +2386,42 @@ class ProductAttributesPredictor():
             demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
         return [attr_dic, demand_dic], total_product_money
 
+    def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
+        """Derive one demand record from time entities when no table data exists.
+
+        Nothing is changed unless ``prem[0]['prem']`` holds exactly one package.
+        Otherwise the 'time' entities of ``list_entitys[0]`` are inspected: an
+        entity is kept when the text joined from the first element returned by
+        ``spanWindow`` (presumably the left context -- TODO confirm against
+        spanWindow) ends with one of the purchase-time keywords. Kept texts are
+        normalised with ``self.fix_time``; if every non-empty result is the
+        same (order_begin, order_end) pair, a single record is appended to
+        ``product_attrs[1]['demand_info']['data']``.
+
+        :param product_attrs: two-element sequence; element 1 carries
+            ``['demand_info']['data']``, the list that may be extended.
+        :param list_sentences: per-document sentence lists; only index 0 is used.
+        :param list_entitys: per-document entity lists; only index 0 is used.
+        :param codeName: sequence whose first item supplies ``['name']``, used
+            as both project name and demand.
+        :param prem: sequence whose first item supplies ``['prem']`` (packages)
+            and ``['product']``.
+        :param html: forwarded unchanged to ``self.fix_time``.
+        :param page_time: forwarded unchanged to ``self.fix_time``.
+        :return: the ``product_attrs`` object that was passed in.
+        """
+        # With zero or several packages there is no single budget to attach.
+        if len(prem[0]['prem'])!=1:
+            return product_attrs
+        list_sentence = list_sentences[0]
+        list_entity = list_entitys[0]
+        _data = product_attrs[1]['demand_info']['data']
+        # Both patterns are compiled once, outside the entity loop.
+        re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份)[::,].{0,2}$")
+        # Raw string: \d, \s, \/ and \. are invalid escapes in a plain literal.
+        standard_time = re.compile(r"((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2})日?)")
+        order_times = []
+        for entity in list_entity:
+            if entity.entity_type!='time':
+                continue
+            sentence = list_sentence[entity.sentence_index]
+            s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index,
+                           end_index=entity.end_index,size=20)
+            entity_left = "".join(s[0])
+            if not re_bidding_time.search(entity_left):
+                continue
+            time_text = entity.entity_text.strip()
+            # Prefer the full year-month-day part when the entity text contains one.
+            time_match = standard_time.search(time_text)
+            if time_match:
+                time_text = time_match.group()
+            order_times.append(time_text)
+        order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times]
+        # Drop results whose begin date came back empty.
+        order_times = [order_time for order_time in order_times if order_time[0]!=""]
+        # A record is only produced when all remaining times agree.
+        if len(set(order_times))==1:
+            order_begin,order_end = order_times[0]
+            project_name = codeName[0]['name']
+            # Exactly one package exists (checked above); take its budget.
+            pack_info = list(prem[0]['prem'].values())
+            budget = pack_info[0].get('tendereeMoney',0)
+            product = prem[0]['product']
+            link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget,
+                    'order_begin': order_begin, 'order_end': order_end}
+            _data.append(link)
+        product_attrs[1]['demand_info']['data'] = _data
+        return product_attrs
+
+
 # docchannel类型提取
 class DocChannel():
   def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):