Преглед на файлове

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

 Conflicts:
	BiddingKG/dl/interface/Preprocessing.py
	BiddingKG/dl/interface/getAttributes.py
	BiddingKG/dl_dev/test/test4.py
znj преди 2 години
родител
ревизия
6d69d90216

+ 15 - 13
BiddingKG/dl/entityLink/entityLink.py

@@ -76,17 +76,19 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
             if _entity.entity_type in ["org","company"]:
                 range_entity.append(_entity)
         range_entity = range_entity[:1000]
-        for first_i in range(len(range_entity)):
-            _entity = range_entity[first_i]
-            for second_i in range(first_i+1,len(range_entity)):
-                _ent = range_entity[second_i]
-                # 2021/5/21 update: 两个实体标签互斥(一个是招标人、一个是代理人)且entity_text不相等时,跳过
-                if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
-                    continue
-                _score = jaccard_score(re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_entity.entity_text), re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_ent.entity_text))
-                if _entity.entity_text!=_ent.entity_text and _score>=on_value:
-                    _entity.linked_entitys.append(_ent)
-                    _ent.linked_entitys.append(_entity)
+        #替换公司的逻辑有问题,先取消
+        # for first_i in range(len(range_entity)):
+        #     _entity = range_entity[first_i]
+        #     for second_i in range(first_i+1,len(range_entity)):
+        #         _ent = range_entity[second_i]
+        #         # 2021/5/21 update: 两个实体标签互斥(一个是招标人、一个是代理人)且entity_text不相等时,跳过
+        #         if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
+        #             continue
+        #         _score = jaccard_score(re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_entity.entity_text), re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_ent.entity_text))
+        #         if _entity.entity_text!=_ent.entity_text and _score>=on_value:
+        #             _entity.linked_entitys.append(_ent)
+        #             _ent.linked_entitys.append(_entity)
+        #             print("=-===",_entity.entity_text,_ent.entity_text,_score)
         #替换公司名称
         for _entity in range_entity:
             if re.search("公司",_entity.entity_text) is None:
@@ -433,6 +435,6 @@ if __name__=="__main__":
     # print(match_enterprise_max_first(sentences))
     #
     # print("takes %d s"%(time.time()-_time))
-    fix_LEGAL_ENTERPRISE()
-    # print(jaccard_score("中国南方航空股份有限公司上海分公司","南方航空上海分公司"))
+    # fix_LEGAL_ENTERPRISE()
+    print(jaccard_score("吉林省九台","吉林省建苑设计集团有限公司"))
     # print(match_enterprise_max_first("中国南方航空股份有限公司黑龙江分公司"))

BIN
BiddingKG/dl/industry/model/channel_foolcut_doc_type_withoutEmb.ckpt.data-00000-of-00001


BIN
BiddingKG/dl/industry/model/channel_foolcut_doc_type_withoutEmb.ckpt.index


BIN
BiddingKG/dl/industry/model/channel_foolcut_doc_type_withoutEmb.ckpt.meta


+ 2 - 0
BiddingKG/dl/industry/model/checkpoint

@@ -0,0 +1,2 @@
+model_checkpoint_path: "channel_foolcut_doc_type_withoutEmb.ckpt"
+all_model_checkpoint_paths: "channel_foolcut_doc_type_withoutEmb.ckpt"

+ 147 - 16
BiddingKG/dl/interface/Preprocessing.py

@@ -343,7 +343,7 @@ def tableToText(soup):
             same_value = inner_table[h][0][0]
             for w in range(width):
                 if last_head is not None:
-                    if inner_table[h-1][w][0]!=fix_value and inner_table[h-1][w][1] == 0:
+                    if inner_table[h-1][w][0] != fix_value and inner_table[h-1][w][0] != "" and inner_table[h-1][w][1] == 0:
                         is_all_key = False
 
                     if inner_table[h][w][0]==1:
@@ -367,13 +367,16 @@ def tableToText(soup):
                 continue
 
             if is_same_value:
-                head_list.append(h)
-                last_is_same_value = is_same_value
-                continue
+                # 该块只有表头一行不合法
+                if h - head_list[-1] > 1:
+                    head_list.append(h)
+                    last_is_same_value = is_same_value
+                    continue
             if not is_all_key:
                 if not is_same_with_lastHead:
-                    head_list.append(h)
-
+                    # 该块只有表头一行不合法
+                    if h - head_list[-1] > 1:
+                        head_list.append(h)
 
         head_list.append(height)
         return head_list
@@ -420,7 +423,7 @@ def tableToText(soup):
         return inner_table,head_list
 
     def set_head_model(inner_table):
-        copy_inner_table = copy.deepcopy(inner_table)
+        origin_inner_table = copy.deepcopy(inner_table)
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
                 # 删掉单格前后符号,以免影响表头预测
@@ -435,7 +438,7 @@ def tableToText(soup):
         # 组合结果
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
-                inner_table[i][j] = [copy_inner_table[i][j][0], int(predict_list[i][j])]
+                inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
         head_list = sliceTable(inner_table)
         return inner_table, head_list
 
@@ -670,7 +673,7 @@ def tableToText(soup):
         # packPattern = "(标包|[标包][号段名])"
         packPattern = "(标包|标的|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
         rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见)"  # 2020/11/23 大网站规则,添加序号为排序
-        entityPattern = "((候选|([中投]标|报价))(单位|公司|人|供应商))"
+        entityPattern = "((候选|[中投]标|报价)(单位|公司|人|供应商))|供应商名称"
         moneyPattern = "([中投]标|报价)(金额|价)"
         height = len(inner_table)
         width = len(inner_table[0])
@@ -1013,6 +1016,7 @@ def tableToText(soup):
             inner_table, head_list = set_head_model(inner_table)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
             # print("table_head", inner_table)
+            # print("head_list", head_list)
             # for begin in range(len(head_list[:-1])):
             #     for item in inner_table[head_list[begin]:head_list[begin+1]]:
             #         print(item)
@@ -1125,6 +1129,8 @@ def get_preprocessed_outline(soup):
     pattern_list = [pattern_0, pattern_1, pattern_2, pattern_3]
 
     body = soup.find("body")
+    if body == None:
+        return soup  # 修复 无body的报错 例子:264419050
     body_child = body.find_all(recursive=False)
     deal_part = body
     # print(body_child[0]['id'])
@@ -1781,6 +1787,9 @@ def special_treatment(sourceContent, web_source_no):
             ser = re.search('支付金额:', sourceContent)
             if ser:
                 sourceContent = sourceContent.replace('支付金额:', '合同金额:')
+        elif web_source_no=='00811-8':
+            if re.search('是否中标:是', sourceContent) and re.search('排名:\d,', sourceContent):
+                sourceContent = re.sub('排名:\d,', '候选', sourceContent)
         return sourceContent
     except Exception as e:
         log('特殊数据源: %s 预处理特别修改抛出异常: %s'%(web_source_no, e))
@@ -1905,6 +1914,121 @@ def attachment_filelink(soup):
         # print('格式化输出',soup.prettify())
         return soup
 
+def del_achievement(text):
+    """Remove bidder track-record ("业绩") passages from announcement text.
+
+    The text is returned untouched unless one of the result-related keywords
+    occurs in its first 500 characters and "业绩" occurs somewhere in it.
+    Otherwise two removal passes run:
+
+    1. Every span that starts at a ``p0`` item heading ending in ``业绩:`` or
+       ``荣誉:`` and runs up to the next ``p0`` match (or the end of the text)
+       is deleted.
+    2. Every match of ``p1``, ``p2`` and ``p3`` is deleted.
+
+    :param text: plain announcement text
+    :return: the text with the matched passages removed
+    """
+    if re.search(r'中标|成交|入围|结果|评标|开标|候选人', text[:500]) is None or re.search(r'业绩', text) is None:
+        return text
+    # Raw strings keep the \d / \w / \s escapes from being parsed as (invalid) string escapes.
+    p0 = r'[,。;]((\d{1,2})|\d{1,2}、)[\w、]{,8}:|((\d{1,2})|\d{1,2}、)|。'  # example doc 264392818
+    p1 = r'业绩[:,](\d、[-\w()、]{6,30}(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)[\w()]{,10}[,;])+'  # example doc 257717618
+    p2 = r'(类似业绩情况:|业绩:)(\w{,20}:)?(((\d)|\d、)项目名称:[-\w(),;、\d\s:]{5,100}[;。])+'  # example doc 264345826
+    p3 = r'(投标|类似|(类似)?项目|合格|有效|企业|工程)?业绩(名称|信息|\d)?:(项目名称:)?[-\w()、]{6,50}(项目|工程|勘察|设计|施工|监理|总承包|采购|更新)'
+
+    # Pass 1: collect the spans opened by a "业绩:" / "荣誉:" heading.
+    segments = []
+    start = None  # offset of the currently open heading, if any
+    for it in re.finditer(p0, text):
+        if it.group(0)[-3:] in ('业绩:', '荣誉:'):
+            # A new heading closes the previous open span before opening its own.
+            if start is not None:
+                segments.append(text[start:it.start()])
+            start = it.start()
+        elif start is not None:
+            # Any other p0 match ends the open span.
+            segments.append(text[start:it.start()])
+            start = None
+    if start is not None:
+        # A heading that is never closed runs to the end of the text.
+        segments.append(text[start:])
+    for del_text in segments:
+        text = text.replace(del_text, '')
+
+    # Pass 2: drop the remaining inline track-record listings.
+    # finditer() scans the string it was given, so rebinding ``text`` inside
+    # the loop does not disturb the iteration.
+    for pattern in (p1, p2, p3):
+        for rs in re.finditer(pattern, text):
+            text = text.replace(rs.group(0), '')
+    return text
+
+def del_tabel_achievement(soup):
+    """Remove track-record ("业绩") tables and table rows from a parsed document.
+
+    Works in place by calling ``extract()`` on matching tags and always
+    returns None. Nothing is touched unless one of the result-related
+    keywords occurs in the first 800 characters of ``soup.text`` and "业绩"
+    occurs somewhere in it.
+
+    :param soup: parsed document exposing ``text`` and ``find_all`` (presumably a BeautifulSoup object; confirm against the caller)
+    :return: None
+    """
+    if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None:
+        return None
+    # Pattern for a short caption that introduces a track-record table.
+    p1 = '中标(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
+    '''删除前面标签 命中业绩规则;当前标签为表格且公布业绩相关信息的去除'''
+    # Pass 1: drop whole tables, judged by the previous sibling's text and the first row.
+    for tag in soup.find_all('table'):
+        pre_text = tag.findPreviousSibling().text.strip() if tag.findPreviousSibling() != None else ""
+        tr_text = tag.find('tr').text.strip() if tag.find('tr') != None else ""
+        #     print(re.search(p1, pre_text),pre_text, len(pre_text), re.findall('序号|中标候选人名称|项目名称|工程名称|合同金额|建设单位|业主', tr_text))
+        # Case A: a short caption (< 20 chars) matching p1 precedes the table and the first row is short (< 100 chars).
+        if re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100:
+            _count = 0
+            for td in tag.find('tr').find_all('td'):
+                td_text = td.text.strip()
+                # A long cell stops the scan of the first row.
+                if len(td_text) > 25:
+                    break
+                if len(td_text) < 25 and re.search('中标候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text):
+                    _count += 1
+                # Two matching header cells are enough: remove the caption tag and the table.
+                if _count >=2:
+                    pre_tag = tag.findPreviousSibling().extract()
+                    del_tag = tag.extract()
+                    # print('删除表格业绩内容', pre_tag.text + del_tag.text)
+                    break
+        # Case B: the first row itself names "业绩名称" together with an owner/purchaser column.
+        elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text)<100:
+            del_tag = tag.extract()
+            # print('删除表格业绩内容', del_tag.text)
+    # NOTE(review): del_trs is created once and never cleared, so rows collected
+    # for earlier tables are extracted again on every later table - confirm this is harmless.
+    del_trs = []
+    '''删除表格某些行公布的业绩信息'''
+    # Pass 2: inside the remaining tables that mention "业绩", drop individual rows.
+    for tag in soup.find_all('table'):
+        text = tag.text
+        if re.search('业绩', text) == None:
+            continue
+        # for tr in tag.find_all('tr'):
+        trs = tag.find_all('tr')
+        i = 0
+        # Index-based loop because some branches jump over several rows at once.
+        while i < len(trs):
+            tr = trs[i]
+            # Two-cell row: short "业绩" label on the left, at least two project-like items on the right.
+            if len(tr.find_all('td'))==2 and tr.td!=None and tr.td.findNextSibling()!=None:
+                td1_text =tr.td.text
+                td2_text =tr.td.findNextSibling().text
+                if re.search('业绩', td1_text)!=None and len(td1_text)<10 and len(re.findall('(\d、|(\d))?[-\w()、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)', td2_text))>=2:
+                    # del_tag = tr.extract()
+                    # print('删除表格业绩内容', del_tag.text)
+                    del_trs.append(tr)
+            # Row whose first cell starts or ends with "业绩" and is shorter than 25 chars.
+            elif tr.td != None and re.search('^业绩|业绩$', tr.td.text.strip()) and len(tr.td.text.strip())<25:
+                rows = tr.td.attrs.get('rowspan', '')
+                cols = tr.td.attrs.get('colspan', '')
+                # First cell spans more than two rows: delete every row it covers.
+                if rows.isdigit() and int(rows)>2:
+                    for j in range(int(rows)):
+                        if i+j < len(trs):
+                            del_trs.append(trs[i+j])
+                    # j ends at rows-1; together with the i += 1 below this lands on the row after the span.
+                    i += j
+                # Single wide heading cell: delete the heading and the rows that follow it.
+                elif cols.isdigit() and int(cols)>3 and len(tr.find_all('td'))==1 and i+2 < len(trs):
+                    next_tr_cols = 0
+                    td_num = 0
+                    # Count the next row's cells and sum their explicit colspans.
+                    for td in trs[i+1].find_all('td'):
+                        td_num += 1
+                        if td.attrs.get('colspan', '').isdigit():
+                            next_tr_cols += int(td.attrs.get('colspan', ''))
+                    # Only proceed when the next row's colspans add up to the heading's width.
+                    if next_tr_cols == int(cols):
+                        del_trs.append(tr)
+                        for j in range(1,len(trs)-i):
+                            # A single-cell row, or a row with fewer than td_num-1 cells, ends the section.
+                            if len(trs[i+j].find_all('td')) == 1:
+                                break
+                            elif len(trs[i+j].find_all('td')) >= td_num-1:
+                                del_trs.append(trs[i+j])
+                            else:
+                                break
+                        # NOTE(review): after a break, i += j followed by i += 1 steps past the row
+                        # that ended the section, so that row is never examined - confirm this is intended.
+                        i += j
+            i += 1
+        # Detach everything collected so far from the tree.
+        for tr in del_trs:
+            del_tag = tr.extract()
+            # print('删除表格业绩内容', del_tag.text)
+
+
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
     '''
     :param articles: 待处理的article source html
@@ -1939,6 +2063,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         start_time = time.time()
         # article_processed = tableToText(BeautifulSoup(sourceContent,"lxml"))
         article_processed = BeautifulSoup(sourceContent,"lxml")
+
+        '''表格业绩内容删除'''
+        del_tabel_achievement(article_processed)
+
         '''特别数据源对 BeautifulSoup(html) 做特别修改'''
         if web_source_no in ["00753-14","DX008357-11","18021-2"]:
             article_processed = special_treatment(article_processed, web_source_no)
@@ -1965,7 +2093,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('任务(?=编号[::])', '项目',article_processed)  # 2022/08/10 修正为项目编号
         article_processed = article_processed.replace('招标(建设)单位', '招标单位')  #2022/8/10 修正预测不到表达
         article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed)
-        article_processed = re.sub('(招标|采购)人(概况|信息)[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
+        article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
+
+        '''去除业绩内容'''
+        article_processed = del_achievement(article_processed)
         # 修复OCR金额中“,”、“。”识别错误
         article_processed_list = article_processed.split("##attachment##")
         if len(article_processed_list)>1:
@@ -1980,7 +2111,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             article_processed_list[1] = attachment_text
             article_processed = "##attachment##".join(article_processed_list)
         '''特别数据源对 预处理后文本 做特别修改'''
-        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2"]:
+        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2", '00811-8']:
             article_processed = special_treatment(article_processed, web_source_no)
 
         # 提取bidway
@@ -2171,7 +2302,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
             list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=0,sentence_text="sentence_text",tokens=[],pos_tags=[],ner_tags=""))
         list_sentences.append(list_sentences_temp)
         list_outlines.append(outline_list)
-        article.content = re.sub("##attachment_begin##|##attachment_end##","",article.content)
+        article.content = re.sub("##attachment_begin##|##attachment_end##", "", article.content)
     return list_sentences,list_outlines
 
 def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
@@ -2234,7 +2365,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             '''正则识别角色实体  经营部|经销部|电脑部|服务部|复印部|印刷部|彩印部|装饰部|修理部|汽修部|修理店|零售店|设计店|服务店|家具店|专卖店|分店|文具行|商行|印刷厂|修理厂|维修中心|修配中心|养护中心|服务中心|会馆|文化馆|超市|门市|商场|家具城|印刷社|经销处'''
             for it in re.finditer(
-                    '(?P<text_key_word>(((单一来源|中标|中选|中价|成交)(供应商|供货商|服务商|候选人|单位|人))|(供应商|供货商|服务商|候选人))(名称)?[为::]+)(?P<text>([^,。、;《::]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[,。]',
+                    '(?P<text_key_word>(((单一来源|中标|中选|中价|成交)(供应商|供货商|服务商|候选人|单位|人))|(供应商|供货商|服务商|候选人))(名称)?[为::]+)(?P<text>([()\w]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[,。]',
                     sentence_text):
                 for k, v in it.groupdict().items():
                     if k == 'text_key_word':
@@ -2250,7 +2381,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     ner_entitys.append((b, e, 'company', entity))
 
             for it in re.finditer(
-                    '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为::]+)(?P<text>\w{2,4}[省市县区镇]([^,。、;《]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园))[,。]',
+                    '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为::]+)(?P<text>\w{2,4}[省市县区镇]([()\w]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园))[,。]',
                     sentence_text):
                 for k, v in it.groupdict().items():
                     if k == 'text_key_word':
@@ -2494,10 +2625,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
                     # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
-                    if re.search('总投资|投资总额|总预算|总概算|投资规模', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
+                    if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
                         # print('总投资金额: ', _match.group(0))
                         notes = '总投资'
-                    elif re.search('投资', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
+                    elif re.search('投资|概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
                         notes = '投资'
                     elif re.search('工程造价', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
                         notes = '工程造价'

BIN
BiddingKG/dl/interface/district_dic.pkl


+ 44 - 7
BiddingKG/dl/interface/extract.py

@@ -106,7 +106,7 @@ def extractCount(extract_dict):
         extract_count += 1
     return extract_count
 
-def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchannel='',**kwargs):
+def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
     cost_time = dict()
 
     start_time = time.time()
@@ -148,7 +148,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
     cost_time["rule"] = round(time.time()-start_time,2)
 
-    '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
+    '''正则补充最后一句实体日期格式为招标或代理 2021/12/30;正则最后补充角色及去掉包含 公共资源交易中心 的招标人'''
     start_time = time.time() #正则角色提取
     predictor.getPredictor("roleRuleFinal").predict(list_articles,list_sentences,list_entitys, codeName)
     cost_time["roleRuleFinal"] = round(time.time()-start_time,2)
@@ -157,6 +157,16 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName)
     cost_time["tendereeRuleRecall"] = round(time.time()-start_time,2)
 
+    '''规则调整角色概率'''
+    start_time = time.time() #
+    predictor.getPredictor("rolegrade").predict(list_sentences,list_entitys)
+    cost_time["rolegrade"] = round(time.time()-start_time,2)
+
+    '''规则调整金额概率'''
+    start_time = time.time() #
+    predictor.getPredictor("moneygrade").predict(list_sentences,list_entitys)
+    cost_time["moneygrade"] = round(time.time()-start_time,2)
+
     start_time = time.time() #联系人模型提取
     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
     log("get epc done of doc_id%s"%(doc_id))
@@ -189,9 +199,13 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
-    start_time = time.time() #失信数据要素提取
-    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
-    cost_time["punish"] = round(time.time()-start_time,2)
+    '''获取联合体信息'''
+    getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)
+
+    #暂时不执行
+    # start_time = time.time() #失信数据要素提取
+    # list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
+    # cost_time["punish"] = round(time.time()-start_time,2)
 
 
     '''修正采购公告表格形式多种采购产品中标价格;中标金额小于所有产品总金额则改为总金额'''
@@ -202,6 +216,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     # content = list_articles[0].content
     # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
     channel_dic, msc = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
+    # print('msc', msc)
     cost_time["rule_channel"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
@@ -219,10 +234,18 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     '''行业分类提取,需要用标题、项目名称、产品、及prem 里面的角色'''
     industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)
 
+    '''地区获取'''
+    start_time = time.time()
+    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name)
+    cost_time["district"] = round(time.time() - start_time, 2)
+
+    '''限制行业最高金额'''
+    getAttributes.limit_maximum_amount(prem, industry)
+
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry)
+    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
@@ -241,7 +264,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
     #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
     #                str(_entity.begin_index),str(_entity.end_index)))
-    return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)#, list_articles[0].content, list_entitys[0]
+    _extract_json = json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+    _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '')
+    return _extract_json#, list_articles[0].content, get_ent_context(list_sentences, list_entitys)
 
 
 def test(name,content):
@@ -255,6 +280,18 @@ def test(name,content):
     # print(resp_json)
     return resp_json
 
+def get_ent_context(list_sentences, list_entitys):
+    """Render org/company/money entities with up to ten characters of context on each side.
+
+    Sentences are taken from ``list_sentences[0]`` only, ordered by
+    ``sentence_index`` and then looked up by position. Each entity becomes one
+    line holding its type, label, the value stored for that label, the text
+    before it, the entity text and the text after it.
+
+    :param list_sentences: per-document sentence lists; only the first is read
+    :param list_entitys: per-document entity lists
+    :return: the rendered lines joined with newlines
+    """
+    ordered = sorted(list_sentences[0], key=lambda sent: sent.sentence_index)
+    wanted_types = ['org', 'company', 'money']
+    lines = []
+    for entities in list_entitys:
+        for ent in entities:
+            if ent.entity_type not in wanted_types:
+                continue
+            sent_text = ordered[ent.sentence_index].sentence_text
+            begin = ent.wordOffset_begin
+            end = ent.wordOffset_end
+            left = sent_text[max(0, begin-10):begin]
+            right = sent_text[end:end+10]
+            lines.append("%s %d %.4f; %s ## %s ## %s"%(ent.entity_type, ent.label, ent.values[ent.label], left, ent.entity_text, right))
+    return '\n'.join(lines)
 
 if __name__=="__main__":
     import pandas as pd

+ 108 - 8
BiddingKG/dl/interface/getAttributes.py

@@ -364,10 +364,10 @@ def get_dict_entity_prob(list_entity,on_value=0.5):
                         if entity.entity_text in identified_role:
                             continue
                     if _key_prob in dict_pack_entity_prob:
-                        new_prob = role_prob+dict_pack_entity_prob[_key_prob][1]
-                        dict_pack_entity_prob[_key_prob] = [entity.entity_text, new_prob] #公司同角色多次出现概率累计
-                        # if role_prob>dict_pack_entity_prob[_key_prob][1]:
-                        #     dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
+                        # new_prob = role_prob+dict_pack_entity_prob[_key_prob][1] if role_prob>0.9 else max(role_prob, dict_pack_entity_prob[_key_prob][1])
+                        # dict_pack_entity_prob[_key_prob] = [entity.entity_text, new_prob] #公司同角色多次出现概率累计
+                        if role_prob>dict_pack_entity_prob[_key_prob][1]:
+                            dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
                     else:
                         dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
     return dict_pack_entity_prob
@@ -575,7 +575,7 @@ def getPackagesFromArticle(list_sentence,list_entity):
     
     package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
     package_N_name_pattern = re.compile("(([^承]|^)分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
-    package_number_pattern = re.compile("(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")  # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
+    package_number_pattern = re.compile("(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|([^\.]?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(标[段号的包]))")  # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
     # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)')  # 新正则识别标段
     other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')  #  # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
     win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]')  # 2020/11/23 大网站规则 调整
@@ -662,6 +662,8 @@ def getPackagesFromArticle(list_sentence,list_entity):
                     dict_packageCode[temp_package_number] = code
                 PackageSet.add(temp_package_number)
         for iter in re.finditer(package_number_pattern,content):
+            if re.match('\d', iter.group(0)) and iter.end()<len(content) and content[iter.end()].isdigit():  # 排除2.10标段3 这种情况
+                continue
             temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
             if re.search(re_digital, temp_package_number):
                 temp_package_number = str(int(temp_package_number))
@@ -3017,8 +3019,7 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
         #                       "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))
     return result
 
-def correct_rolemoney(prem, total_product_money):
-    print('total_product_money',total_product_money)
+def correct_rolemoney(prem, total_product_money): # 2022/9/26修改为 中标金额小于表格单价数量合计总金额十分之一时替换
     if total_product_money>0 and len(prem[0]['prem'])==1:
         for value in prem[0]['prem'].values():
             for l in value['roleList']:
@@ -3026,12 +3027,111 @@ def correct_rolemoney(prem, total_product_money):
                     # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
                     #     l[2] = total_product_money
                     #     log('修改中标金额为所有产品总金额')
-                    if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money:
+                    if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money/10:
                         l["role_money"]['money'] = total_product_money
                         # log('修改中标金额为所有产品总金额')
                 except Exception as e:
                     print('表格产品价格修正中标价格报错:%s'%e)
 
+def limit_maximum_amount(prem, industry):
+    """Scale down amounts that exceed the ceiling configured for the predicted industry.
+
+    Only industries listed in the ceiling table are handled. For each package
+    in ``prem[0]['prem']`` the winning bidder's ``role_money['money']`` and the
+    package's ``tendereeMoney`` are divided by 10000 when they exceed the
+    ceiling and either the industry is 餐饮业 / 物业管理 or the recorded unit
+    is '万元'. The bidder amount is written back as a string, the tenderee
+    amount as a float. ``prem`` is modified in place; any exception is printed
+    and swallowed, leaving earlier modifications in effect.
+
+    :param prem: extraction result list; ``prem[0]['prem']`` maps package name to package dict
+    :param industry: industry prediction; ``industry['industry']['class_name']`` is read
+    :return: None
+    """
+    class_name = industry['industry'].get('class_name', '')
+    ceilings = {
+        '计算机设备': 200000000,
+        '办公设备': 100000000,
+        '家具用具': 500000000,
+        '办公消耗用品及类似物品': 100000000,
+        '日杂用品': 100000000,
+        '餐饮业': 1000000000,
+        '物业管理': 1000000000,
+        '工程技术与设计服务': 1000000000,
+        '工程评价服务': 100000000,
+        '其他工程服务': 100000000,
+        '工程监理服务': 100000000,
+        '工程造价服务': 100000000
+    }
+    if class_name not in ceilings:
+        return
+    ceiling = ceilings[class_name]
+    # These two industries are rescaled whatever unit was recorded.
+    always_rescale = class_name in ['餐饮业', '物业管理']
+    try:
+        for package in prem[0]['prem'].values():
+            for role in package['roleList']:
+                if role["role_name"] != 'win_tenderer':
+                    continue
+                if not (float(role["role_money"]['money']) > ceiling):
+                    continue
+                # The unit is only consulted when the industry does not force a rescale.
+                if always_rescale or role["role_money"]['money_unit'] == '万元':
+                    role["role_money"]['money'] = str(float(role["role_money"]['money'])/10000)
+            if float(package['tendereeMoney']) > ceiling:
+                if always_rescale or package['tendereeMoneyUnit'] == '万元':
+                    package['tendereeMoney'] = float(package['tendereeMoney'])/10000
+    except Exception as e:
+        print('行业分类限制最高金额抛出异常:%s' % e)
+
+def get_win_joint(prem, list_entitys, list_sentences, list_articles):
+    '''
+    Collect consortium members of the winning bidder and store them on prem.
+
+    For every ``win_tenderer`` role, each mention of the winner (an org/company
+    entity with label 2 and the same text) is located, and the entities that
+    follow it are walked. A following entity joins the consortium when it is an
+    org/company in the same sentence and either a consortium marker lies in
+    the (shorter than 10 characters) text between the previous member and it,
+    or a consortium marker appears within the 10 characters after it. The walk
+    stops at the first entity that does not qualify. When at least one member
+    was found, the names are written to the role's ``win_tenderer_joint`` key,
+    comma-joined with the winner first and without duplicates.
+
+    Any exception is printed and swallowed.
+
+    :param prem: extraction result list; the dict values of prem[0] are scanned for packages with a roleList
+    :param list_entitys: per-document entity lists
+    :param list_sentences: per-document sentence lists; only the first is read
+    :param list_articles: per-document articles; only the first article's content is read
+    :return: None
+    '''
+    # Marker expected between the winner (or previous member) and the next member.
+    prefix_pattern = '联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:'
+    # Marker expected right after a member name.
+    suffix_pattern = '(联合体)|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)'
+    try:
+        # Cheap pre-check: skip documents with no winner or no consortium wording at all.
+        if 'win_tenderer' not in str(prem) or not re.search(prefix_pattern + '|' + suffix_pattern, list_articles[0].content):
+            return
+        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
+        for project in prem[0].values():
+            if not isinstance(project, dict):
+                continue
+            for v in project.values():
+                for d in v['roleList']:
+                    if d.get('role_name', '') != 'win_tenderer':
+                        continue
+                    winner = d.get('role_text')
+                    join_l = [winner]
+                    for list_entity in list_entitys:
+                        for i in range(len(list_entity)-1):
+                            _entity = list_entity[i]
+                            if not (_entity.entity_type in ['org', 'company'] and _entity.label == 2
+                                    and _entity.entity_text == winner):
+                                continue
+                            s = sentences[_entity.sentence_index].sentence_text
+                            # End offset of the most recently accepted member.
+                            e = _entity.wordOffset_end
+                            for behind_entity in list_entity[i+1:]:
+                                # Offsets are only comparable inside the winner's own sentence,
+                                # so sentence and type are required for both marker checks.
+                                if behind_entity.sentence_index != _entity.sentence_index \
+                                        or behind_entity.entity_type not in ['org', 'company']:
+                                    break
+                                b2 = behind_entity.wordOffset_begin
+                                e2 = behind_entity.wordOffset_end
+                                # The prefix marker must lie in the gap between the two names.
+                                has_prefix = b2 - e < 10 and re.search(prefix_pattern, s[e:b2])
+                                has_suffix = re.search(suffix_pattern, s[e2:e2+10])
+                                if not (has_prefix or has_suffix):
+                                    break
+                                # The winner may be mentioned several times; list each member once.
+                                if behind_entity.entity_text not in join_l:
+                                    join_l.append(behind_entity.entity_text)
+                                e = e2
+                            if len(join_l) > 1:
+                                d['win_tenderer_joint'] = ','.join(join_l)
+    except Exception as err:
+        print('获取联合体抛出异常', err)
+
+
 if __name__=="__main__":
     '''
     conn = getConnection()

+ 10 - 10
BiddingKG/dl/interface/modelFactory.py

@@ -41,7 +41,7 @@ class Model_role_classify():
             return self.getModel().predict([x[0],x[1]])
     
 class Model_role_classify_word():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         if USE_PAI_EAS:
             lazyLoad = True
         #self.model_role_file = os.path.abspath("../role/log/ep071-loss0.107-val_loss0.122-f10.956.h5")
@@ -49,7 +49,7 @@ class Model_role_classify_word():
         #self.model_role_file = os.path.abspath("../role/log/textcnn_ep017-loss0.088-val_loss0.125-f10.955.h5")
         self.model_role = None
         
-        self.sess_role = tf.Session(graph=tf.Graph())
+        self.sess_role = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
         
@@ -94,12 +94,12 @@ class Model_role_classify_word():
         
     
 class Model_money_classify():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         if USE_PAI_EAS:
             lazyLoad = True
         self.model_money_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
         self.model_money = None
-        self.sess_money = tf.Session(graph=tf.Graph())
+        self.sess_money = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
         
@@ -345,12 +345,12 @@ class Model_relation_extraction():
 
     
 class Model_person_classify():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         if USE_PAI_EAS:
             lazyLoad = True
         self.model_person_file = os.path.dirname(__file__)+"/../person/models/model_person.model.hdf5"
         self.model_person = None
-        self.sess_person = tf.Session(graph=tf.Graph())
+        self.sess_person = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
         
@@ -436,10 +436,10 @@ class Model_form_line():
             return self.getModel().predict(x)
     
 class Model_form_item():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         self.model_file = os.path.dirname(__file__)+"/../form/log/ep039-loss0.038-val_loss0.064-f10.9783.h5"
         self.model_form = None
-        self.sess_form = tf.Session(graph=tf.Graph())
+        self.sess_form = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
 
@@ -485,9 +485,9 @@ class Model_form_item():
         '''
 
 class Model_form_context():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         self.model_form = None
-        self.sess_form = tf.Session(graph=tf.Graph())
+        self.sess_form = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
 

+ 360 - 82
BiddingKG/dl/interface/predictor.py

@@ -17,6 +17,7 @@ sys.path.append(os.path.abspath("../.."))
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.modelFactory import *
 import tensorflow as tf
+import pandas as pd
 from BiddingKG.dl.product.data_util import decode, process_data
 from BiddingKG.dl.interface.Entitys import Entity
 from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
@@ -27,6 +28,13 @@ import calendar
 import datetime
 # import fool   # 统一用 selffool ,阿里云上只有selffool 包
 
+# Thread count for the TensorFlow session config, read from the CPU_NUM
+# environment variable (0 when the variable is unset).
+cpu_num = int(os.environ.get("CPU_NUM",0))
+sess_config = tf.ConfigProto(
+                        inter_op_parallelism_threads = cpu_num,
+                        intra_op_parallelism_threads = cpu_num,
+                        log_device_placement=True)
+# NOTE(review): the ConfigProto built above is discarded right here, so the
+# module-level sess_config is None from this point on. Looks like a debugging
+# leftover or a deliberate kill-switch; confirm which one is intended.
+sess_config = None
+
 from threading import RLock
 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
               "prem":{"predictor":None,"Lock":RLock()},
@@ -42,7 +50,10 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   "channel": {"predictor": None, "Lock": RLock()},
                   "deposit_payment_way": {"predictor": None, "Lock": RLock()},
                   "total_unit_money": {"predictor": None, "Lock": RLock()},
-                  "industry": {"predictor": None, "Lock": RLock()}
+                  "industry": {"predictor": None, "Lock": RLock()},
+                  "rolegrade": {"predictor": None, "Lock": RLock()},
+                  "moneygrade": {"predictor": None, "Lock": RLock()},
+                  "district": {"predictor": None, "Lock": RLock()}
                   }
 
 
@@ -51,11 +62,11 @@ def getPredictor(_type):
         with dict_predictor[_type]["Lock"]:
             if dict_predictor[_type]["predictor"] is None:
                 if _type == "codeName":
-                    dict_predictor[_type]["predictor"] = CodeNamePredict()
+                    dict_predictor[_type]["predictor"] = CodeNamePredict(config=sess_config)
                 if _type == "prem":
-                    dict_predictor[_type]["predictor"] = PREMPredict()
+                    dict_predictor[_type]["predictor"] = PREMPredict(config=sess_config)
                 if _type == "epc":
-                    dict_predictor[_type]["predictor"] = EPCPredict()
+                    dict_predictor[_type]["predictor"] = EPCPredict(config=sess_config)
                 if _type == "roleRule":
                     dict_predictor[_type]["predictor"] = RoleRulePredictor()
                 if _type == "roleRuleFinal":
@@ -63,23 +74,29 @@ def getPredictor(_type):
                 if _type == "tendereeRuleRecall":
                     dict_predictor[_type]["predictor"] = TendereeRuleRecall()
                 if _type == "form":
-                    dict_predictor[_type]["predictor"] = FormPredictor()
+                    dict_predictor[_type]["predictor"] = FormPredictor(config=sess_config)
                 if _type == "time":
-                    dict_predictor[_type]["predictor"] = TimePredictor()
+                    dict_predictor[_type]["predictor"] = TimePredictor(config=sess_config)
                 if _type == "punish":
                     dict_predictor[_type]["predictor"] = Punish_Extract()
                 if _type == "product":
-                    dict_predictor[_type]["predictor"] = ProductPredictor()
+                    dict_predictor[_type]["predictor"] = ProductPredictor(config=sess_config)
                 if _type == "product_attrs":
                     dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
                 if _type == "channel":
-                    dict_predictor[_type]["predictor"] = DocChannel()
+                    dict_predictor[_type]["predictor"] = DocChannel(config=sess_config)
                 if _type == 'deposit_payment_way':
                     dict_predictor[_type]["predictor"] = DepositPaymentWay()
                 if _type == 'total_unit_money':
                     dict_predictor[_type]["predictor"] = TotalUnitMoney()
                 if _type == 'industry':
                     dict_predictor[_type]["predictor"] = IndustryPredictor()
+                if _type == 'rolegrade':
+                    dict_predictor[_type]["predictor"] = RoleGrade()
+                if _type == 'moneygrade':
+                    dict_predictor[_type]["predictor"] = MoneyGrade()
+                if _type == 'district':
+                    dict_predictor[_type]["predictor"] = DistrictPredictor()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
@@ -87,7 +104,7 @@ def getPredictor(_type):
 # 编号名称模型
 class CodeNamePredict():
     
-    def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
+    def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad(),config=None):
         
         self.model = None
         self.MAX_LEN = None
@@ -123,8 +140,8 @@ class CodeNamePredict():
         
         self.inputs = None
         self.outputs = None
-        self.sess_codename = tf.Session(graph=tf.Graph())
-        self.sess_codesplit = tf.Session(graph=tf.Graph())
+        self.sess_codename = tf.Session(graph=tf.Graph(),config=config)
+        self.sess_codesplit = tf.Session(graph=tf.Graph(),config=config)
         self.inputs_code = None
         self.outputs_code = None
         if not lazyLoad:
@@ -536,11 +553,11 @@ class CodeNamePredict():
 class PREMPredict():
 
     
-    def __init__(self):
+    def __init__(self,config=None):
         #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
         self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
-        self.model_role = Model_role_classify_word()
-        self.model_money = Model_money_classify()
+        self.model_role = Model_role_classify_word(config=config)
+        self.model_money = Model_money_classify(config=config)
         
         return
     
@@ -567,7 +584,7 @@ class PREMPredict():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
-                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
+                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
                             #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
                             item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
                             data_x.append(item_x)
@@ -665,6 +682,15 @@ class PREMPredict():
                 elif re.search('尊敬的供应商:.{,25}我公司', text):
                     label = 0
                     values[label] = 0.801
+                elif re.search('尊敬的供应商:', text):
+                    label = 0
+                    values[label] = 0.501
+                elif re.search('[^\w]中标候选人:', text) and re.search('[1一]', text) == None:  #修复第4以上的预测错为中标人
+                    label = 5
+                    values[label] = 0.5
+            elif re.search('是否中标:是,供应商', text) and label == 5:
+                label = 2
+                values[label] = 0.9
             elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
                 label = 0
                 values[label] = 0.501
@@ -735,8 +761,8 @@ class PREMPredict():
 #联系人模型    
 class EPCPredict():
     
-    def __init__(self):
-        self.model_person = Model_person_classify()
+    def __init__(self,config=None):
+        self.model_person = Model_person_classify(config=config)
 
 
     
@@ -1075,13 +1101,13 @@ class EPCPredict():
 #表格预测
 class FormPredictor():
     
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
         self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
-        self.model_form_item = Model_form_item()
-        self.model_form_context = Model_form_context()
+        self.model_form_item = Model_form_item(config=config)
         self.model_dict = {"line":[None,self.model_file_line]}
-        
+        self.model_form_context = Model_form_context(config=config)
+
         
     def getModel(self,type):
         if type=="item":
@@ -1121,22 +1147,22 @@ class RoleRulePredictor():
         self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)" \
                                      "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
                                      "(是|为|:|:|\s*)+$)"
-        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至]+采购意向))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
+        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
+        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
-        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
+        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
         self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|承租|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
                                         "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
-                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
-        self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
-        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
+                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施|合作)(机构|单位|商|方)(名称)?[::是为]+$)"
+        self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
+        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
         # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
-                                        "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^成为[\w、()()]+项目的成交供应商|^[((]中标人名称[))]))"
-        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|(谈判结果:|确定)由.{5,20}(向我单位)?供货)|中标通知书.{,15}你方"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
+                                        "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[作]?([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((](中标|成交|承包)人名??[))]))"
+        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货|中标通知书.{,15}你方|单一来源从[()\w]{5,20}采购)"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
         # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
 
@@ -1167,8 +1193,8 @@ class RoleRulePredictor():
 
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
-        self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
-        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况")
+        self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|限价|拦标价|预算金额|标底|总计|限额")
+        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价")
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
         self.pattern_money_other = re.compile("代理费|服务费")
         self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
@@ -1214,10 +1240,10 @@ class RoleRulePredictor():
                                                    word_flag=True, use_text=True,
                                                    text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
                                 for _name in list_name:
-                                    if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
+                                    if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:  #加上前面一些信息,修复公司不在项目名称开头的,检测不到
                                         find_flag = True
                                         if p_entity.values[0] > on_value:
-                                            p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
+                                            p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
                                         else:
                                             p_entity.values[0] = on_value  # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
                         if find_flag:
@@ -1308,7 +1334,7 @@ class RoleRulePredictor():
                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标:否',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
@@ -1343,8 +1369,10 @@ class RoleRulePredictor():
                         for _sentence in list_sentence:
                             if _sentence.sentence_index == p_entity.sentence_index:
                                 _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
-                                                   end_index=p_entity.end_index, size=20, center_include=True,
+                                                   end_index=p_entity.end_index, size=10, center_include=True,
                                                    word_flag=True, text=p_entity.entity_text)
+                                if re.search(',\w{2,}', _span[0]):
+                                    _span[0] = _span[0].split(',')[-1]  #避免多个价格在一起造成误判
                                 if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
                                         self.pattern_money_other, _span[0]) is None:
                                     p_entity.values[0] = 0.8 + p_entity.values[0] / 10
@@ -1415,6 +1443,14 @@ class RoleRulePredictor():
 '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
 class RoleRuleFinalAdd():
     def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
+        '''
+        最终规则召回角色
+        :param list_articles:
+        :param list_sentences:
+        :param list_entitys:
+        :param list_codenames:
+        :return:
+        '''
         # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
         main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
         end_tokens = []
@@ -1423,11 +1459,12 @@ class RoleRuleFinalAdd():
         text_end = "".join(end_tokens[-30:])
         # print(text_end)
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
-        sear_ent = re.search('[,。;]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
-        sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
-        sear_ent3 = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
-        sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主)[,::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
-        sear_list = [sear_ent4 , sear_ent3 , sear_ent2 , sear_ent]
+        sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
+        sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
+        sear_ent2 = re.search('[,:](户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
+        sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点|)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
+        sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
+        sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
 
         tenderee_notfound = True
         agency_notfound = True
@@ -1435,32 +1472,21 @@ class RoleRuleFinalAdd():
         ents = []
         for ent in list_entitys[0]:
             if ent.entity_type in ['org', 'company']:
-                if ent.label == 0:
+                if ent.label == 0 and ent.values[ent.label]>=0.5:
+                    if '公共资源交易中心' in ent.entity_text:
+                        ent.label = 5
+                        continue
                     tenderee_list.append(ent.entity_text)
                     tenderee_notfound = False
                 elif ent.label == 1:
                     agency_notfound = False
                 elif ent.label == 5:
+                    if '公共资源交易中心' in ent.entity_text:
+                        continue
                     ents.append(ent)
-        if sear_ent or sear_ent2 or sear_ent3 or sear_ent4:
+        if sear_ent or sear_ent1 or sear_ent2 or sear_ent3 or sear_ent4:
             for _sear_ent in [_sear for _sear in sear_list if _sear]:
-                # if sear_ent4:
-                #     ent_re = sear_ent4.group(2)
-                # elif sear_ent3:
-                #     ent_re = sear_ent3.group(2)
-                # elif sear_ent2:
-                #     ent_re = sear_ent2.group(2)
-                # else:
-                #     ent_re = sear_ent.group(1)
-                if _sear_ent==sear_ent4:
-                    ent_re = _sear_ent.group(2)
-                elif _sear_ent==sear_ent3:
-                    ent_re = _sear_ent.group(2)
-                elif _sear_ent==sear_ent2:
-                    ent_re = _sear_ent.group(2)
-                else:
-                    ent_re = _sear_ent.group(1)
-                # print('ent_re', ent_re)
+                ent_re = _sear_ent.group('entity')
                 ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
 
                 if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
@@ -1490,8 +1516,8 @@ class RoleRuleFinalAdd():
                             agency_notfound = False
                             # log('正则最后补充实体: %s'%(ent_re))
                             break
-                if not tenderee_notfound:
-                    break
+                    if not tenderee_notfound:
+                        break
 
         elif list_codenames[0]['name'] != "":  #把标题包含的公司实体作为招标人
             # tenderee_notfound = True
@@ -1510,6 +1536,7 @@ class RoleRuleFinalAdd():
                     if ent.entity_text in list_codenames[0]['name']:
                         ent.label = 0
                         ent.values[0] = 0.5
+                        tenderee_notfound == False
                         # log('正则召回标题中包含的实体:%s'%ent.entity_text)
                         break
 
@@ -1793,10 +1820,115 @@ class TendereeRuleRecall():
                 list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
                 break
 
+class RoleGrade():
+    """Rule-based re-grading of role probabilities for org/company entities.
+
+    Each rule is a regex string whose named group is ``<role>_<direction>_<grade>``;
+    ``predict`` parses that name out of the pattern text to learn which role the
+    rule supports, which side of the entity to look at, and which grade to assign.
+    """
+    def __init__(self):
+        # Rule strings. The named-group name is parsed by predict(), so it must
+        # keep the exact "<role>_<direction>_<grade>" shape (three '_'-separated parts).
+        self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|甲)(人|方|单位))"
+        self.tenderee_center_9 = "(?P<tenderee_center_9>受.{5,20}委托)"
+        self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
+        self.agency_left_9 = "(?P<agency_left_9>代理)"
+        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得|乙方)|第[1一]|排名:1)"
+        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商))"
+        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排名:2))"
+        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排名:3))"
+        # Rules are tried in this order; the first matching rule whose role equals
+        # the entity's current label is applied and ends the search.
+        self.pattern_list = [self.tenderee_left_9,self.tenderee_center_9, self.tenderee_left_8,self.agency_left_9, self.winTenderer_left_9,
+                             self.winTenderer_left_8, self.secondTenderer_left_9, self.thirdTenderer_left_9]
+    def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
+        '''
+        Assign graded probabilities to roles according to rules. Three grades:
+        0.9-1, 0.8-0.9, 0.7-0.8; for attachments: 0.7-0.8, 0.6-0.7, 0.5-0.6.
+        Updates entity.values in place; there is no return statement.
+        :param list_sentences: only list_sentences[0] is read (sentence objects)
+        :param list_entitys: only list_entitys[0] is read (entity objects)
+        :param span: number of characters of context taken around the entity
+        :param min_prob: threshold/base used when no rule matches
+        :return:
+        '''
+        # NOTE(review): sentences are indexed positionally below, which presumably
+        # relies on sentence_index values being contiguous from 0 - confirm.
+        sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
+        role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}
+        for entity in list_entitys[0]:
+            # Only org/company entities already labelled with one of the five roles
+            # at a probability above 0.5 are re-graded.
+            if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> 0.5:
+                text = sentences[entity.sentence_index].sentence_text
+                in_att = sentences[entity.sentence_index].in_attachment
+                pre_prob = entity.values[entity.label]
+                b = entity.wordOffset_begin
+                e = entity.wordOffset_end
+                not_found = 1
+                for pattern in self.pattern_list:
+                    # The direction is detected by substring search on the whole
+                    # pattern text, and selects the context window to search.
+                    if 'left' in pattern:
+                        context = text[max(0, b-span):b]
+                    elif 'right' in pattern:
+                        context = text[e:e+span]
+                    elif 'center' in pattern:
+                        context = text[max(0, b-span):e+span]
+                    else:
+                        # NOTE(review): context is not assigned on this path, so the
+                        # search below would use a stale value or raise NameError.
+                        # Every pattern defined in __init__ contains left/center.
+                        print('规则错误', pattern)
+                    ser = re.search(pattern, context)
+                    if ser:
+                        # Recover "<role>_<direction>_<grade>" from the pattern text itself.
+                        groupdict = pattern.split('>')[0].replace('(?P<', '')
+                        _role, _direct, _prob = groupdict.split('_')
+                        _label = role2id.get(_role)
+                        # A rule for a different role never changes the entity; keep looking.
+                        if _label != entity.label:
+                            continue
+                        _prob = int(_prob)*0.1
+                        # print('role probability before rule adjustment:', entity.entity_text, entity.label, entity.values)
+                        # Attachment sentences are graded 0.2 lower.
+                        if in_att:
+                            _prob = _prob - 0.2
+                        # When the incoming probability is below the rule's grade,
+                        # the base is replaced by 0.65.
+                        if pre_prob < _prob:
+                            _prob = 0.65
+                        # New value = grade base + a twentieth of the previous value.
+                        entity.values[_label] = _prob + entity.values[_label] / 20
+                        not_found = 0
+                        # print('role probability after rule adjustment:', entity.entity_text, entity.label, entity.values)
+                        break
+                # No rule applied: values above min_prob are rebased onto min_prob
+                # (0.1 lower inside attachments); lower values are left untouched.
+                if not_found and entity.values[entity.label]> min_prob:
+                    _prob = min_prob - 0.1 if in_att else min_prob
+                    entity.values[entity.label] = _prob + entity.values[entity.label] / 20
+                    # print('role probability adjusted without a matching rule:', entity.entity_text, entity.label, entity.values)
+
+
+class MoneyGrade():
+    """Rule-based re-grading of probabilities for money entities.
+
+    Each rule is a regex string whose first named group is
+    ``<role>_<direction>_<grade>``; ``predict`` parses that name out of the
+    pattern text to learn which role the rule supports and which grade to assign.
+    """
+    def __init__(self):
+        # NOTE(review): the named group closes before "|控制价|拦标价", so those two
+        # alternatives sit outside it. predict() only reads the group name from the
+        # pattern text, so they are still graded as tenderee_left_9.
+        self.tenderee_money_left_9 = "(?P<tenderee_left_9>最高(投标)?限价)|控制价|拦标价"
+        self.tenderee_money_left_8 = "(?P<tenderee_left_8>预算|限价|起始|起拍|底价|标底)"
+        self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同|总报价))"
+        self.tenderer_money_left_8 = "(?P<tenderer_left_8>(投标|总价))"
+
+        # NOTE(review): tenderer_money_left_8 is defined above but is not part of
+        # this list, so it is never applied - confirm whether that is intended.
+        self.pattern_list = [self.tenderee_money_left_9, self.tenderee_money_left_8, self.tenderer_money_left_9]
+
+    def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
+        '''
+        Assign graded probabilities to money entities according to rules.
+        Updates entity.values in place; there is no return statement.
+        :param list_sentences: only list_sentences[0] is read (sentence objects)
+        :param list_entitys: only list_entitys[0] is read (entity objects)
+        :param span: number of characters taken to the left of the entity as context
+        :param min_prob: threshold/base used when no rule matches
+        :return:
+        '''
+        # NOTE(review): sentences are indexed positionally below, which presumably
+        # relies on sentence_index values being contiguous from 0 - confirm.
+        sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
+        role2id = {"tenderee": 0, "tenderer": 1}
+        for entity in list_entitys[0]:
+            # Only money entities labelled 0 or 1 at a probability above 0.6 are re-graded.
+            if entity.entity_type in ['money'] and entity.label in [0, 1] and entity.values[entity.label]> 0.6:
+                text = sentences[entity.sentence_index].sentence_text
+                in_att = sentences[entity.sentence_index].in_attachment
+                b = entity.wordOffset_begin
+                # e is assigned but not used in this method.
+                e = entity.wordOffset_end
+                # All rules are searched in the text immediately left of the entity.
+                context = text[max(0, b - span):b]
+                not_found = 1
+                for pattern in self.pattern_list:
+                    ser = re.search(pattern, context)
+                    if ser:
+                        # Recover "<role>_<direction>_<grade>" from the pattern text itself.
+                        groupdict = pattern.split('>')[0].replace('(?P<', '')
+                        _role, _direct, _prob = groupdict.split('_')
+                        _label = role2id.get(_role)
+                        # A rule for the other role never changes the entity; keep looking.
+                        if _label != entity.label:
+                            continue
+                        _prob = int(_prob) * 0.1
+                        # print('money probability before rule adjustment:', entity.entity_text, entity.label, entity.values)
+                        # Attachment sentences are graded 0.2 lower.
+                        if in_att:
+                            _prob = _prob - 0.2
+                        # New value = grade base + a twentieth of the previous value.
+                        entity.values[_label] = _prob + entity.values[_label] / 20
+                        not_found = 0
+                        # print('money probability after rule adjustment:', entity.entity_text, entity.label, entity.values)
+                        break
+                # No rule applied: values above min_prob are rebased onto min_prob
+                # (0.1 lower inside attachments); lower values are left untouched.
+                if not_found and entity.values[entity.label] > min_prob:
+                    _prob = min_prob - 0.1 if in_att else min_prob
+                    entity.values[entity.label] = _prob + entity.values[entity.label] / 20
+                    # print('money probability adjusted without a matching rule:', entity.entity_text, entity.label, entity.values)
+
+
 # 时间类别
 class TimePredictor():
-    def __init__(self):
-        self.sess = tf.Session(graph=tf.Graph())
+    def __init__(self,config=None):
+        self.sess = tf.Session(graph=tf.Graph(),config=config)
         self.inputs_code = None
         self.outputs_code = None
         self.input_shape = (2,40,128)
@@ -1900,11 +2032,11 @@ class TimePredictor():
 
 # 产品字段提取
 class ProductPredictor():
-    def __init__(self):
+    def __init__(self,config=None):
         vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
         self.vocab = load(vocabpath)
         self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
-        self.sess = tf.Session(graph=tf.Graph())
+        self.sess = tf.Session(graph=tf.Graph(),config=config)
         self.load_model()
 
     def load_model(self):
@@ -2097,6 +2229,7 @@ class ProductAttributesPredictor():
                 continue
             for td in tds:
                 td_text = re.sub('\s', '', td.get_text())
+                td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/') # 修复272144312 # 产品单价数量提取结果有特殊符号\  气动执行装置备件\密封组件\NBR+PT
                 tr_line.append(td_text)
             inner_table.append(tr_line)
         return inner_table
@@ -2789,9 +2922,9 @@ class ProductAttributesPredictor():
 
 # docchannel类型提取
 class DocChannel():
-  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
+  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb',config=None):
     self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
-    self.mask, self.mask_title = self.load_life(life_model)
+    self.mask, self.mask_title = self.load_life(life_model,config)
     self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
     self.type_mask, self.type_mask_title = self.load_type(type_model)
     self.sequen_len = 200  # 150 200
@@ -2831,13 +2964,13 @@ class DocChannel():
           '公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
           '候选人公示': '候选人公示|评标结果公示',
           '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果)\w{,4}(进行公示|公[示布]如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
-          '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购',
+          '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示',
           '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
           '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示',
       # |确定成交供应商[:,\s]
           '合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|签订日期)|(供应商乙方|乙方供应商):|合同总?金额',
-          '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
-          '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?原因|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人数|供应商|单位)不足|已终止'
+          '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
+          '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)'
       }
       self.title_life_dic = {
           '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示|意向公开',
@@ -2852,7 +2985,7 @@ class DocChannel():
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
       }
 
-  def load_life(self,life_model):
+  def load_life(self,life_model,config):
     with tf.Graph().as_default() as graph:
       output_graph_def = graph.as_graph_def()
       with open(os.path.dirname(__file__)+life_model, 'rb') as f:
@@ -2860,7 +2993,7 @@ class DocChannel():
         tf.import_graph_def(output_graph_def, name='')
         # print("%d ops in the final graph" % len(output_graph_def.node))
         del output_graph_def
-        sess = tf.Session(graph=graph)
+        sess = tf.Session(graph=graph,config=config)
         sess.run(tf.global_variables_initializer())
         inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
         prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
@@ -3108,9 +3241,9 @@ class DocChannel():
               else:
                   html = html[:ser.start() + 500]
           text = re.sub('<[^<]*?>', '', html).replace('&nbsp;', ' ')
-          text = re.sub('http[0-9a-zA-Z-.:/]+|[0-9a-zA-Z-./@]+', '', text)
+          # text = re.sub('http[0-9a-zA-Z-.:/]+|[0-9a-zA-Z-./@]+', '', text)
           text = re.sub('\s+', ' ', text)
-          text = re.sub('[/|[()()]', '', text)
+          # text = re.sub('[/|[()()]', '', text)
           text = cut_single_cn_space(text)
           return text[:20000]
 
@@ -3213,7 +3346,6 @@ class DocChannel():
                   life_list = [k]
               elif life_score[k] == max_score and life_score[k] > 0:
                   life_list.append(k)
-
           if '采购意向' in life_kw_title or '采购意向' in life_list:
               return '采购意向', msc
           elif '招标预告' in life_kw_title or '招标预告' in life_list:
@@ -3241,18 +3373,23 @@ class DocChannel():
           elif '候选人公示' in life_kw_title or '候选人公示' in life_list:
               if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
                   return '招标公告', msc
+              elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
+                  return '废标公告', msc
               return '候选人公示', msc
           elif '合同公告' in life_kw_title or '合同公告' in life_list:
               if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
                   return '招标公告', msc
+              elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
+                  return '废标公告', msc
               return '合同公告', msc
+
           elif '中标信息' in life_kw_title or '中标信息' in life_list:
               if '招标公告' in life_kw_title and life_score.get('招标公告',
                                                             0) > 2:  # (life_score.get('招标公告', 0)>2 or life_score.get('中标信息', 0)<4) 0.7886409793924245
                   return '招标公告', msc
-              elif '废标公告' in life_kw_title:
+              elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
                   return '废标公告', msc
-              elif life_score.get('候选人公示', 0) >= 3:
+              elif life_score.get('候选人公示', 0) > 3:
                   return '候选人公示', msc
               elif life_score.get('合同公告', 0) > 5:
                   return '合同公告', msc
@@ -3315,10 +3452,11 @@ class DocChannel():
           2、废标公告有中标人且标题无废标关键词,返回中标信息
           3、答疑公告标题无答疑关键且原始为招标,返回原始类别
           4、招标公告有中标人且原始为中标,返回中标信息
-          5、预测及原始均在招标、预告、意向,返回原始类别
+          5、预测为招标,原始为预告、意向,返回原始类别
           6、预测及原始均在变更、答疑,返回原始类别
           7、预测为采招数据,原始为产权且有关键词,返回原始类别
           8、废标公告原始为招标、预告且标题无废标关键期,返回原始类别
+          9、若预测为非采招数据且源网为采招数据且标题无关键词返回采招数据
           '''
           if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
                   original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json)==False:
@@ -3337,8 +3475,8 @@ class DocChannel():
                   original_docchannel, '') == '中标信息':
               result['docchannel']['docchannel'] = '中标信息'
               msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
-          elif result['docchannel']['docchannel'] in ['招标公告', '采购意向', '招标预告'] and origin_dic.get(
-                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
+          elif result['docchannel']['docchannel'] in ['招标公告'] and origin_dic.get(
+                  original_docchannel, '') in ['采购意向', '招标预告']:
               result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
               msc += '最终规则修改:预测及原始均在招标、预告、意向,返回原始类别'
           elif result['docchannel']['docchannel'] in ['招标答疑', '公告变更'] and origin_dic.get(
@@ -3354,6 +3492,10 @@ class DocChannel():
                   self.title_life_dic['废标公告'], title) == None:
               result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
               msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
+          elif result['docchannel']['doctype'] != '采招数据' and origin_dic.get(
+                  original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产|挂牌|出让|拍卖|招拍|划拨', title)==None:
+              result['docchannel']['doctype'] = '采招数据'
+              msc += '最终规则修改:预测为非采招数据,原始为采招数据且无关键词,返回采招数据'
 
           '''下面是新格式增加返回字段'''
           if result['docchannel']['docchannel'] != '':  # 预测到生命周期的复制到life_docchannel,否则用数据源结果
@@ -3425,7 +3567,7 @@ class DocChannel():
               type_id, type_prob = type_model_predict()
               type_model = self.id2type[type_id]
               result['docchannel']['doctype'] = type_model
-              msc += type_model + ';'
+              msc += type_model + ' 概率:%.4f;'%type_prob
               # print('公告类别:', self.id2type[id], '概率:',prob)
               # if id == 0:
           if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
@@ -3433,7 +3575,7 @@ class DocChannel():
                   life_id, life_prob = life_model_predict()
                   life_model = self.id2life[life_id]
                   result['docchannel']['docchannel'] = life_model
-                  msc += life_model + ';\n'
+                  msc += life_model + ' 概率:%.4f;\n'%life_prob
 
       msc = final_change(msc)
       # print('channel ', msc)
@@ -4034,6 +4176,142 @@ class IndustryPredictor():
                             }
                 }
 
+class DistrictPredictor():  # Rule-based locator: scores district names found in text and returns area / province / city / district
+    def __init__(self):  # loads the district dictionary and precomputes the name regexes and the name -> ids maps
+        with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f:
+            dist_dic = pickle.load(f)  # NOTE(review): pickle runs code on load; only safe while this file is trusted and ships with the package
+            short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))  # regex alternation of all short names, longest first; names are joined unescaped
+            full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))  # regex alternation of all full names, longest first; names are joined unescaped
+            short2id = {}  # short name -> list of ids that share it
+            full2id = {}  # full name -> list of ids that share it
+            for k, v in dist_dic.items():
+                if v['简称'] not in short2id:
+                    short2id[v['简称']] = [k]
+                else:
+                    short2id[v['简称']].append(k)
+                if v['全称'] not in full2id:
+                    full2id[v['全称']] = [k]
+                else:
+                    full2id[v['全称']].append(k)
+            self.dist_dic = dist_dic
+            self.short_name = short_name
+            self.full_name = full_name
+            self.short2id = short2id
+            self.full2id = full2id
+
+    def predict(self, project_name, prem, title, list_articles, web_source_name = ""):  # returns {'district': {'area', 'province', 'city', 'district'}}
+        '''
+        先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content
+        :param project_name:
+        :param prem:
+        :param title:
+        :param list_articles:
+        :param web_source_name:
+        :return:
+        '''  # docstring (EN): match project_name + tenderee + tenderee_address first; if province or city is still missing, match title + content
+        def get_ree_addr(prem):  # first tenderee name and its address found in prem[0]['prem'][*]['roleList']
+            tenderee = ""
+            tenderee_address = ""
+            try:
+                for v in prem[0]['prem'].values():
+                    for link in v['roleList']:
+                        if link['role_name'] == 'tenderee' and tenderee == "":
+                            tenderee = link['role_text']
+                            tenderee_address = link['address']
+            except Exception as e:  # best effort: on any parsing problem the values found so far are returned
+                print('解析prem 获取招标人、及地址出错')
+            return tenderee, tenderee_address
+        def get_area(text, web_source_name):  # score every district id mentioned in text / web_source_name, then pick the top province -> city -> district
+            score_l = []  # rows of [id, score, province, city, district]
+            id_set = set()  # ids that already received the one-off type bonus
+
+            if re.search(self.short_name, text):  # full and short names are only scanned when at least one short name occurs in text
+                for it in re.finditer(self.full_name, text):
+                    name = it.group(0)
+                    score = len(name) / len(text)  # share of the text covered by the matched full name
+                    for _id in self.full2id[name]:
+                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))  # pad to three entries: province, city, district
+                        # score_l.append([_id, score] + area)
+                        w = self.dist_dic[_id]['权重']
+                        score_l.append([_id, score + w] + area)
+
+                flag = 0  # never set to 1 below, so the `if flag == 1` branch is dead code
+                for it in re.finditer(self.short_name, text):
+                    if it.end() < len(text) and re.search('^(村|镇|街|路|江|河|湖|北路|南路|东路|大道|社区)', text[it.end():]) == None:  # skips matches ending exactly at the end of text, and matches followed by a village/town/street/road/river/lake/avenue/community suffix
+                        name = it.group(0)
+                        score = (it.start() + len(name)) / len(text)  # relative end position of the match: later mentions score higher
+                        for _id in self.short2id[name]:
+                            score2 = 0
+                            w = self.dist_dic[_id]['权重']
+                            _type = self.dist_dic[_id]['类型']
+                            area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))  # pad to three entries: province, city, district
+                            if area[0] in ['2', '16', '20', '30']:  # NOTE(review): the meaning of these province ids is not visible here - confirm against district_dic
+                                _type += 10
+                            score2 += w
+                            if _id not in id_set:  # the type bonus is granted only on the first mention of an id
+                                if _type == 20:
+                                    type_w = 3
+                                elif _type == 30:
+                                    type_w = 2
+                                else:
+                                    type_w = 1
+                                id_set.add(_id)
+                                score2 += w * type_w
+                            score_l.append([_id, score * w + score2] + area)
+
+                if flag == 1:
+                    pass
+                #         print('score', score)
+            if re.search('公司', web_source_name) == None:  # source names containing '公司' (company) contribute nothing
+                for it in re.finditer(self.short_name, web_source_name):
+                    name = it.group(0)
+                    for _id in self.short2id[name]:
+                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
+                        w = self.dist_dic[_id]['权重']
+                        score = w * 0.2  # a hit in the source name counts for a fifth of the entry weight
+                        score_l.append([_id, score] + area)
+            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知'}  # default result when nothing can be located
+            if len(score_l) == 0:
+                return {'district': area_dic}
+            else:
+                df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
+                df_pro = df.groupby('province').sum().sort_values(by=['score'], ascending=False)  # total score per province, best first
+                pro_id = df_pro.index[0]
+                if df_pro.loc[pro_id, 'score'] < 0.1:  # discard the result when the best province total is below 0.1
+                    # print('评分低于0.1', df_pro.loc[pro_id, 'score'], self.dist_dic[pro_id]['地区'])
+                    return {'district': area_dic}
+                area_dic['province'] = self.dist_dic[pro_id]['地区']
+                area_dic['area'] = self.dist_dic[pro_id]['大区']
+                df = df[df['city'] != ""]  # keep only rows that carry a city ...
+                df = df[df['province'] == pro_id]  # ... inside the winning province
+                if len(df) > 0:
+                    df_city = df.groupby('city').sum().sort_values(by=['score'], ascending=False)  # total score per city, best first
+                    city_id = df_city.index[0]
+                    area_dic['city'] = self.dist_dic[city_id]['地区']
+                    df = df[df['district'] != ""]  # keep only rows that carry a district ...
+                    df = df[df['city'] == city_id]  # ... inside the winning city
+                    if len(df) > 0:
+                        df_dist = df.groupby('district').sum().sort_values(by=['score'], ascending=False)  # total score per district, best first
+                        dist_id = df_dist.index[0]
+                        area_dic['district'] = self.dist_dic[dist_id]['地区']
+                # print(area_dic)
+                return {'district': area_dic}
+
+        tenderee, tenderee_address = get_ree_addr(prem)
+        project_name = str(project_name).replace(str(tenderee), '')  # the tenderee name is appended separately in text1 below
+        text1 = "{} {} {}".format(project_name, tenderee, tenderee_address)
+        web_source_name = str(web_source_name)  # coerce to str: non-string values previously raised errors
+        text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  # blank out words that would otherwise be mis-read as the places 合肥 / 路南 / 新会
+        rs = get_area(text1, web_source_name)
+        if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':  # province or city still missing -> second pass on title + content
+            text2 = title + list_articles[0].content if len(list_articles[0].content)<2000 else title + list_articles[0].content[:1000] + list_articles[0].content[-1000:]  # content of 2000+ chars is reduced to its first and last 1000 chars
+            text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
+            rs2 = get_area(text2, web_source_name)
+            if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':  # first pass found no province: take the second pass
+                rs = rs2
+            elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':  # same province and the second pass found a city: take it
+                rs = rs2
+        return rs
 
 
 def getSavedModel():

BIN
BiddingKG/dl/table_head/best_tiny.hdf5


+ 119 - 1
BiddingKG/dl/table_head/models/model.py

@@ -73,6 +73,124 @@ def model_1(input_shape, output_shape):
     return model
 
 
+def model_1_small(input_shape, output_shape):  # builds a six-input Keras model (32 LSTM units per direction) ending in one sigmoid unit
+    # Six inputs, each shaped input_shape[1:] (presumably (10, 60) per box - TODO confirm against the caller)
+    input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_4 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_5 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_6 = layers.Input(shape=input_shape[1:], dtype="float32")
+
+    # ----------- Three box sequence -----------
+    # Concat along axis -2 (the sequence axis): inputs 1-3 and inputs 4-6 each become one sequence three times as long
+    concat_1 = layers.concatenate([input_1, input_2, input_3], axis=-2, name='seq_concat')
+    concat_2 = layers.concatenate([input_4, input_5, input_6], axis=-2)
+
+    # Bi-LSTM, 32 units per direction -> 64 features per step (Bidirectional concatenates both directions by default)
+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(concat_1)
+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(concat_2)
+
+    # Self-Attention over each sequence; NOTE(review): feature width presumably stays 64 - confirm against SeqSelfAttention
+    self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
+    self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
+
+    # Dense with output_shape[0] units per step; the squeeze below needs this to be 1
+    dense_1 = layers.Dense(output_shape[0], activation="relu")(self_attention_1)
+    dense_2 = layers.Dense(output_shape[0], activation="relu")(self_attention_2)
+
+    # Squeeze drops the trailing axis -> (batch, steps)
+    squeeze_1 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense_1)
+    squeeze_2 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense_2)
+
+    # ----------- One box feature -----------
+    # Bi-LSTM on input_2 alone, 32 units per direction -> 64 features per step
+    bi_lstm = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(input_2)
+
+    # Self-Attention over the single-box sequence; NOTE(review): feature width presumably stays 64 - confirm
+    self_attention = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm)
+
+    # mask mean pooling
+    # pool_1 = MyAveragePooling1D(axis=-1)(self_attention_1)
+
+    # Dense with output_shape[0] units per step; the squeeze below needs this to be 1
+    dense = layers.Dense(output_shape[0], activation="relu")(self_attention)
+
+    # Squeeze drops the trailing axis -> (batch, steps) - one box feature
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+
+    # ----------- Three box sequence & One box feature -----------
+    # Concatenate the three squeezed vectors, then Dense(32, relu) -> Dense(1, sigmoid), i.e. output (batch, 1)
+    concat = layers.concatenate([squeeze, squeeze_1, squeeze_2])
+    output = layers.Dense(32, activation='relu')(concat)
+    output = layers.Dense(1, activation="sigmoid", name='output')(output)
+
+    model = models.Model(inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
+                         outputs=output)
+
+    # model.summary()
+    return model
+
+
+def model_1_tiny(input_shape, output_shape):  # builds a six-input Keras model (16 LSTM units per direction) ending in one sigmoid unit
+    # Six inputs, each shaped input_shape[1:] (presumably (10, 60) per box - TODO confirm against the caller)
+    input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_4 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_5 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_6 = layers.Input(shape=input_shape[1:], dtype="float32")
+
+    # ----------- Three box sequence -----------
+    # Concat along axis -2 (the sequence axis): inputs 1-3 and inputs 4-6 each become one sequence three times as long
+    concat_1 = layers.concatenate([input_1, input_2, input_3], axis=-2, name='seq_concat')
+    concat_2 = layers.concatenate([input_4, input_5, input_6], axis=-2)
+
+    # Bi-LSTM, 16 units per direction -> 32 features per step (Bidirectional concatenates both directions by default)
+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(concat_1)
+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(concat_2)
+
+    # Self-Attention over each sequence; NOTE(review): feature width presumably stays 32 - confirm against SeqSelfAttention
+    self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
+    self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
+
+    # Dense with output_shape[0] units per step; the squeeze below needs this to be 1
+    dense_1 = layers.Dense(output_shape[0], activation="relu")(self_attention_1)
+    dense_2 = layers.Dense(output_shape[0], activation="relu")(self_attention_2)
+
+    # Squeeze drops the trailing axis -> (batch, steps)
+    squeeze_1 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense_1)
+    squeeze_2 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense_2)
+
+    # ----------- One box feature -----------
+    # Bi-LSTM on input_2 alone, 16 units per direction -> 32 features per step
+    bi_lstm = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_2)
+
+    # Self-Attention over the single-box sequence; NOTE(review): feature width presumably stays 32 - confirm
+    self_attention = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm)
+
+    # mask mean pooling
+    # pool_1 = MyAveragePooling1D(axis=-1)(self_attention_1)
+
+    # Dense with output_shape[0] units per step; the squeeze below needs this to be 1
+    dense = layers.Dense(output_shape[0], activation="relu")(self_attention)
+
+    # Squeeze drops the trailing axis -> (batch, steps) - one box feature
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+
+    # ----------- Three box sequence & One box feature -----------
+    # Concatenate the three squeezed vectors, then Dense(16, relu) -> Dense(1, sigmoid), i.e. output (batch, 1)
+    concat = layers.concatenate([squeeze, squeeze_1, squeeze_2])
+    output = layers.Dense(16, activation='relu')(concat)
+    output = layers.Dense(1, activation="sigmoid", name='output')(output)
+
+    model = models.Model(inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
+                         outputs=output)
+
+    # model.summary()
+    return model
+
+
 def model_2(input_shape, output_shape):
     # input_shape = (None, None, 10, 60)
     # (batch_size, row_num, col_num, character_num, character_embedding)
@@ -266,7 +384,7 @@ def model_3(input_shape, output_shape):
 
 def get_model(input_shape, output_shape, model_id):
     if model_id == 1:
-        return model_1(input_shape, output_shape)
+        return model_1_tiny(input_shape, output_shape)
     elif model_id == 2:
         return model_2(input_shape, output_shape)
     elif model_id == 3:

+ 11 - 4
BiddingKG/dl/table_head/predict.py

@@ -24,9 +24,12 @@ if model_id == 1:
 else:
     input_shape = (None, None, 20, 60)
     output_shape = (None, None)
-keras_model_path = os.path.abspath(os.path.dirname(__file__)) + "/best.hdf5"
+keras_model_path = os.path.abspath(os.path.dirname(__file__)) + "/best_tiny.hdf5"
 # keras模型加载预测都使用同一个session、同一个graph,即可多进程推理
-sess = tf.Session(graph=tf.Graph())
+session_conf = tf.ConfigProto(
+    intra_op_parallelism_threads=5,
+    inter_op_parallelism_threads=5)
+sess = tf.Session(graph=tf.Graph(), config=session_conf)
 # graph = tf.get_default_graph()
 
 # tf_model_path = os.path.abspath(os.path.dirname(__file__)) + '/best_pb/1'
@@ -49,6 +52,7 @@ sess = tf.Session(graph=tf.Graph())
 
 
 def predict(table_text_list, model_id=1):
+    start_time = time.time()
     if globals().get("model") is None:
         print("="*15, "init table_head model", "="*15)
         with sess.as_default():
@@ -76,13 +80,15 @@ def predict(table_text_list, model_id=1):
         predict_x = my_data_loader_2(data_list, [], 1, is_train=False)
 
     # 预测
+    # start_time = time.time()
     with sess.as_default():
         with sess.graph.as_default():
-            # predict_result = model.predict_generator(predict_x, steps=steps)
+            # predict_result = model.predict_generator(predict_x, steps=1)
             # 设置batch size为1最快,默认为32很慢
             predict_result = model.predict([predict_x[0], predict_x[1], predict_x[2],
                                             predict_x[3], predict_x[4], predict_x[5]],
-                                           batch_size=1)
+                                           batch_size=256)
+    # print("table head predict time", time.time()-start_time, predict_x.shape)
 
     # 数据后处理
     if model_id == 1:
@@ -92,6 +98,7 @@ def predict(table_text_list, model_id=1):
 
     # 打印保存结构
     # save_print_result(table_text_list, table_label_list)
+    # print("table_head predict cost", str(time.time()-start_time))
     return table_label_list
 
 

+ 3 - 3
BiddingKG/dl/table_head/train.py

@@ -19,10 +19,10 @@ if model_id == 1:
     output_shape = (1,)
     batch_size = 128
     epochs = 1000
-    PRETRAINED = True
+    PRETRAINED = False
     CHECKPOINT = False
     # 用GPU
-    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 else:
     input_shape = (None, None, 20, 60)
     output_shape = (None, None)
@@ -92,7 +92,7 @@ def train():
                   # loss_weights={"output": 0.5},
                   metrics=['acc', precision, recall, f1])
 
-    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=10,
+    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.5, patience=10,
                             verbose=1, mode='max', cooldown=0, min_lr=0)
 
     model.fit_generator(train_data_loader,

+ 4 - 1
BiddingKG/dl_dev/test/12.py

@@ -1,4 +1,7 @@
-print("243705217")
+import re
 
 
+import time
 
+
+print(time.localtime(1663878377786/1000))

+ 3 - 3
BiddingKG/dl_dev/test/test4.py

@@ -53,9 +53,9 @@ def test(name,content,_url=None):
     # _resp = requests.post(list_url[_i], json=user, headers=myheaders, verify=True)
 
     # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
-    _url = "http://192.168.2.102:15030/test"
-    _url = "http://192.168.2.102:15030/industry_extract"
-    _url = "http://192.168.2.102:15030/content_extract"
+    _url = "http://127.0.0.1:15030/content_extract"
+    # _url = "http://192.168.2.102:15030/industry_extract"
+    # _url = "http://192.168.2.102:15030/content_extract"
 
     _resp = session.post(_url, json=user,verify=True,timeout=1000)
     # _resp = requests.post("http://192.168.2.102:15000" + '/article_extract', json=user, headers=myheaders, verify=True)

+ 6 - 0
BiddingKG/hello.html

@@ -0,0 +1,6 @@
+<title>Hello from Flask</title>
+{% if name %}
+<h1>Hello {{ name }}!</h1>
+{% else %}
+<h1>Hello World!</h1>
+{% endif %}

+ 1 - 1
BiddingKG/maxcompute/documentDumplicate.py

@@ -2141,7 +2141,7 @@ if __name__ == '__main__':
 #     c = f_get_nlp_enterprise()
 #     print(c.evaluate("山东东岳项目管理有限公司",_json))
 #     print(c.evaluate(_json))
-#     c = f_set_docid()
+#     c = f_set_docid()f_get_single_merged_bychannel
 #     _s = '''
 #     154064190	1512489600	4	03689-11	1	大连市妇女儿童医疗中心
 #     154064188	1512489600	4	03689-11	1	大连市妇女儿童医疗中心

+ 15 - 9
BiddingKG/maxcompute/documentMerge.py

@@ -364,6 +364,16 @@ class f_get_single_merged_bychannel(BaseUDTF):
         _d = {"data":{str(docid):[]},"process_time":getCurrent_date()}
         self.forward(json.dumps(_d))
 
+@annotate('string->string')
+class f_get_single_merged_docids(object):  # string -> string UDF: lists the keys of the "data" object in a JSON document
+
+    def evaluate(self,_json):  # returns the keys of _json["data"] joined by ","; "" for empty or None input
+        if _json!="" and _json is not None:
+            _d = json.loads(_json)  # raises on malformed JSON; no handling here
+            _keys = _d.get("data",{}).keys()  # a missing "data" entry yields no keys, hence ""
+            return ",".join(list(_keys))
+        return ""
+
 
 
 
@@ -1252,14 +1262,8 @@ if __name__ == '__main__':
     a = f_remege_limit_num_contain_bychannel()
     buffer = a.new_buffer()
     tmp_s = '''
-    234858920	229011768	2022-03-25	1648137600		横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工招标文件.pdf	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工文件.pdf	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司				103	0	7	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "2022-04-29", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    234858920	232745950	2022-04-12	1649692800	E4404000001002779001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工招标答疑	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工答疑	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司				103	0	8	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    234858920	234858920	2022-04-21	1650470400	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工						101	1	2	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    234858920	234595980	2022-04-20	1650384000	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司				105	0	10	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-04-22", "time_publicity_start": "2022-04-21", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    234858920	228908786	2022-03-25	1648137600	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司			1795743.68	52	0	8	"{"time_bidclose": "2022-04-20", "time_bidopen": "2022-04-20", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "2022-04-20", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "2022-03-26", "time_publicity_end": "2022-04-26", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    234858920	234523333	2022-04-20	1650384000	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工						101	0	2	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    234858920	234787082	2022-04-20	1650384000	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工开标记录表	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工开标记录表					1795743.68	101	0	6	"{"time_bidclose": "", "time_bidopen": "2022-04-20", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    234858920	235240618	2022-04-22	1650556800	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工			广东博思信息技术股份有限公司	1775136.23		101	0	12	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-04-26", "time_publicity_start": "2022-04-24", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    266523906	266539038	2022-09-08	1662566400	SDGP371525000202201000421_A	冠县第二实验小学平台教育信息化设备采购智慧屏	冠县第二实验小学平台教育信息化设备采购智慧屏成交公告	冠县第二实验小学平台教育信息化设备智慧屏	冠县第二实验小学	聊城市采购中心	山东润博网络有限公司	246890.0		101	0	12	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    266523906	266523906	2022-09-15	1663171200	SDGP371525000202201000421_A	冠县第二实验小学平台教育信息化设备采购智慧屏	冠县第二实验小学平台教育信息化设备采购智慧屏成交公告	冠县第二实验小学平台教育信息化设备智慧屏	冠县第二实验小学	聊城市采购中心	山东润博网络有限公司	246890.0		101	999	12	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
 
     '''
     for _s in tmp_s.split("\n"):
@@ -1272,4 +1276,6 @@ if __name__ == '__main__':
     # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
     # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-22", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
     print(a.terminate(buffer))
-    print(1)
+    print(1)
+
+    print(getSimilarityOfString('37168100014015220220012_40785671','SDGP371681000202201000912'))

+ 5 - 7
BiddingKG/readme/start.md

@@ -3,21 +3,19 @@
 #项目路径在/data/python/BiddingKG
 
 #11022启动要素提取接口
-#激活环境
-source activate py37
 #切换目录
 cd /data/python
 #关闭接口
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #启动接口
-nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+#nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=14 &
 
 #19022启动要素提取接口
-#激活环境
-source activate py37
 #切换目录
-cd /data/python
+cd /data/python 
 #关闭接口
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #启动接口
-nohup /data/anaconda3/envs/py37/bin/gunicorn -w 6 --limit-request-fields 0 --limit-request-line 0 -t 1000 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+#nohup /data/anaconda3/envs/py37/bin/gunicorn -w 5 --limit-request-fields 0 --limit-request-line 0 -t 1000  --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=7 &

+ 42 - 5
BiddingKG/run_extract_server.py

@@ -17,11 +17,16 @@ os.environ["KERAS_BACKEND"] = "tensorflow"
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
 
+limit_num = "4"  # thread cap written to every *_NUM_THREADS variable below; kept as a str because os.environ only accepts strings
+os.environ["OMP_NUM_THREADS"] = limit_num # 1 means one core; with a value of 5 the system showed 10 cores in use, the exact relationship is unclear
+os.environ["OMP_NUM_THREADS"] = limit_num # shell equivalent: export OMP_NUM_THREADS=4 -- NOTE(review): repeats the assignment on the previous line
+os.environ["OPENBLAS_NUM_THREADS"] = limit_num # shell equivalent: export OPENBLAS_NUM_THREADS=4
+os.environ["MKL_NUM_THREADS"] = limit_num # shell equivalent: export MKL_NUM_THREADS=4
+os.environ["VECLIB_MAXIMUM_THREADS"] = limit_num # shell equivalent: export VECLIB_MAXIMUM_THREADS=4
+os.environ["NUMEXPR_NUM_THREADS"] = limit_num # shell equivalent: export NUMEXPR_NUM_THREADS=4
 
 import time
 import uuid
-from BiddingKG.dl.common.Utils import log
-from BiddingKG.dl.interface.extract import predict
 import numpy as np
 import ctypes
 import inspect
@@ -98,6 +103,9 @@ def run_thread(data,list_result):
 
 @app.route("/test",methods=['POST'])
 def test():
+    from BiddingKG.dl.common.Utils import log
+    from BiddingKG.dl.interface.extract import predict
+    global predict,log
     _time = time.time()
     a = request.form.get("content")
     log("get form takes %.2fs"%(time.time()-_time))
@@ -107,7 +115,9 @@ def test():
 
 @app.route('/content_extract', methods=['POST'])
 def text_predict():
-
+    from BiddingKG.dl.common.Utils import log
+    from BiddingKG.dl.interface.extract import predict
+    global predict,log
     _time = time.time()
     data = request.json
 
@@ -136,6 +146,7 @@ def text_predict():
 
 def getPort(argv):
     port = 15030
+    print(argv)
     for item in argv:
         _l = str(item).split("port=")
         if len(_l)>1:
@@ -143,8 +154,34 @@ def getPort(argv):
             break
     return port
 
-if __name__ == '__main__':
+def getWorkers(argv):  # Return the worker count passed as "worker=N" in argv; 15 when no item contains "worker=".
+    worker = 15  # default used when no argument matches
+    for item in argv:
+        _l = str(item).split("worker=")  # more than one piece means the item contains "worker="
+        if len(_l)>1:
+            worker = int(_l[-1])  # text after the last "worker="; int() raises ValueError if it is not an integer
+            break  # only the first matching argument is used
+    return worker
+
+def start_with_tornado(port,process_num):  # Serve the Flask `app` through a Tornado HTTPServer bound to `port`, started with `process_num`.
+    from tornado.wsgi import WSGIContainer
+    from tornado.httpserver import HTTPServer
+    from tornado.ioloop import IOLoop
+
+    http_server = HTTPServer(WSGIContainer(app))  # WSGIContainer adapts the WSGI app so HTTPServer can serve it
+    # http_server.listen(port) #shortcut for bind and start
+    http_server.bind(port)  # bind first, start separately, so the process count can be passed to start()
+    http_server.start(process_num)  # per the Tornado docs this argument is the number of server processes -- TODO confirm for the installed version
+    IOLoop.instance().start()  # runs the event loop; does not return until the loop is stopped
+
+def start_with_flask():  # Run `app` on Flask's built-in server, on the port parsed from sys.argv by getPort().
     port = getPort(argv=sys.argv)
     app.run(host='0.0.0.0', port=port, threaded=True, debug=False)  # listens on all interfaces; returns only when the server stops
     log("ContentExtractor running")  # NOTE(review): this commit removes the module-level import of log; confirm the name is bound before this line runs
-    # app.run()
+    # app.run()
+
+if __name__ == '__main__':  # script entry point: serve through Tornado using settings read from the command line
+    port = getPort(argv=sys.argv)  # "port=N" argument, parsed by getPort()
+    workers = getWorkers(argv=sys.argv)  # "worker=N" argument, parsed by getWorkers()
+    start_with_tornado(port,workers)
+    pass  # no-op

+ 35 - 0
BiddingKG/test_deployment.py

@@ -0,0 +1,35 @@
+
+
+from flask import Flask,render_template
+from flask import request
+
+app = Flask(__name__)  # Flask application used by the deployment test routes below
+app.config['JSON_AS_ASCII'] = False  # presumably so non-ASCII text is not escaped in JSON responses -- confirm against the Flask version in use
+
+@app.route("/test")
+def test():  # Handler for /test: does a fixed amount of arithmetic, then renders hello.html.
+    data = request.json  # request body parsed as JSON; not used afterwards
+    j = 0
+    for i in range(10000):  # sums squares of 0..9999; j is never read, so this looks like artificial CPU load -- TODO confirm intent
+       j += i**2
+
+    return render_template("hello.html")
+
+@app.route("/render")
+def render():  # Handler for /render: renders hello.html with no other work.
+    return render_template("hello.html")
+
+
+def test_with_tornado():  # Serve the test `app` through a Tornado HTTPServer on port 15000.
+    from tornado.httpserver import HTTPServer
+    from tornado.wsgi import WSGIContainer
+    from tornado.ioloop import IOLoop
+
+    httpserver = HTTPServer(WSGIContainer(app))  # WSGIContainer adapts the WSGI app so HTTPServer can serve it
+    httpserver.bind(15000)  # fixed test port
+    httpserver.start(1)  # started with a process count of 1
+    IOLoop.instance().start()  # runs the event loop; does not return until the loop is stopped
+
+if __name__ == '__main__':  # script entry point for the deployment test
+    test_with_tornado()
+