Ver Fonte

Merge branch 'master' of http://192.168.2.65:3000/BIDI-ML/BIDI_ML_INFO_EXTRACTION

 Conflicts:
	BiddingKG/dl/interface/extract.py
luojiehua há 3 anos atrás
pai
commit
d5c0816575

+ 219 - 0
BiddingKG/dl/channel/re_channel_103.py

@@ -0,0 +1,219 @@
+import pandas as pd
+import re
+
+# 各投标人
+# 各潜在投标人
+# 各潜在投标人:
+# 致各招标文件持有者:
+# 致各投标人
+# 各潜在投标供应商:
+
+# 修改、澄清(答疑)纪要内容如下: 1、
+# 答疑澄清与修改的主要内容:
+# 对文件澄清与修改的主要内容
+# 澄清、修改内容要点
+# 答疑纪要
+# 答疑如下
+# 招标文件答疑和招标文件修改通知
+# 招标文件答疑通知
+# 答疑及补遗通知
+# 答疑回复如下:
+# 现对投标人提出的质疑回复如下:
+# 对文件澄清与修改的主要内容 详见招标文件
+# 修改的主要内容 详见附件
+# 澄清或修改事项:
+
+# 第1次答疑
+# 第1次答疑澄清
+
+# 答疑补遗文件
+# 补遗书澄清文件 答疑澄清
+# 质疑1
+# 问题
+# 答疑文件1
+# 具体补遗内容详见附件
+# 请问 答
+# 问题 回复
+# 答疑澄清公告 1:
+# 现对招标文件作如下澄清:
+# 详见答疑澄清文件
+# 详见答疑文件。
+
+
# Regex building blocks for detecting "channel 103" announcements
# (clarification / Q&A / addendum / amendment notices).

# Generic pre-filter: text must mention 澄清/答疑/补遗/修改 to be a candidate.
channel_103 = '(澄清|答疑|补遗|修改)'
# Form 0: salutation addressed to (potential) bidders, e.g. 致各招标文件持有者:
channel_103_0 = '(致|至|)(各|各个)(潜在|)(投标|招标|招标文件持有|报价|竞选|)(人|者|供应商|单位)(:|:)'
# Form 1: "<keyword> ... 内容/回复/纪要 ..." headers that introduce the changes.
channel_103_1 = '(澄清|答疑|补遗|修改|质疑)(.?)(具体内容|主要内容|内容|回复|发布|纪要|事项|如下){1,2}(.?)' \
                '(如下|[::]|详见|点击下载附件|[1一][::、]|(1)|\\(1\\)|一)'
# Form 2: "第N次答疑/澄清" (N-th Q&A round); requires channel_103_after context.
channel_103_2 = '第(.?)次(答疑|澄清)'
# Form 3: "<keyword>公告/文件"; requires channel_103_after context.
channel_103_3 = '(澄清|答疑|补遗|修改)(公告|文件)'
# Confirmation context that must appear within 50 chars after a form-2/3 match.
channel_103_after = '(请问|提问|问题|答复|回复|质疑|答|问){1,2}[12一]?[::]|[一1][::、]|(1)|\\(1\\)|(详见|见)(附件|答疑文件|澄清文件|答疑澄清文件)'
# Form 4: question/answer labels such as 问题1: / 回复:.
channel_103_4 = '(补充答疑|提疑内容|请问|提问|问题|回复|答复|答疑|质疑|答|问)[12一]?[::]'
# Form 5: "see the clarification/Q&A attachment" phrases.
channel_103_5 = '(见|详见)(答疑澄清文件|澄清文件|答疑文件)|补遗内容详见附件'

# Phrases that superficially look like channel 103 but must be masked out:
#   答疑澄清时间  (a deadline field, not an announcement)
#   主要内容.?无  (explicitly states there is no clarification)
#   请各投标单位  (a plain instruction to bidders)
not_channel_103 = '答疑澄清时间|主要内容.?无|请各投标单位'
+
+
def re_standard_channel_103(_str):
    """Locate channel-103 (clarification/Q&A) keywords in *_str*.

    Patterns are tried in priority order (form 0 through form 5); the
    first pattern that yields any match determines the result.

    :param _str: preprocessed announcement text
    :return: list of ``[keyword, [begin, end]]`` items, empty when no
        pattern matches or the generic pre-filter fails.
    """
    # Fast pre-filter: without 澄清/答疑/补遗/修改 nothing below can match.
    if not re.search(channel_103, _str):
        return []

    # (pattern, require_context): forms 2 and 3 are ambiguous on their own
    # and are only accepted when channel_103_after appears within the
    # 50 characters that follow the match.
    staged_patterns = [
        (channel_103_0, False),
        (channel_103_1, False),
        (channel_103_2, True),
        (channel_103_3, True),
        (channel_103_4, False),
        (channel_103_5, False),
    ]
    for pattern, require_context in staged_patterns:
        matches = []
        for m in re.finditer("(?P<value>" + pattern + ")", _str):
            begin, end = m.span()
            if require_context and not re.search(channel_103_after,
                                                 _str[end:end + 50]):
                continue
            matches.append([m.group("value"), [begin, end]])
        if matches:
            return matches
    return []
+
+
def re_not_channel_103(_str):
    """Mask confusable phrases before channel-103 matching.

    Every match of ``not_channel_103`` is replaced in a single pass by a
    run of ``#`` of the same length, so character offsets of the
    remaining text are preserved.

    Note: the original implementation re-fed each matched string back
    into ``re.sub`` as a *pattern*, which breaks if the matched text
    contains regex metacharacters and rescans the whole string per
    match; a replacement callable avoids both problems.
    """
    return re.sub(not_channel_103, lambda m: "#" * len(m.group()), _str)
+
+
def re_channel_103(text):
    """Extract channel-103 keywords from *text*.

    Confusable phrases are masked first so they cannot produce false
    positives, then the staged standard patterns are applied.

    :return: list of ``[keyword, [begin, end]]`` items.
    """
    masked_text = re_not_channel_103(text)
    return re_standard_channel_103(masked_text)
+
+
def extract_channel_103(text):
    """Return channel-103 matches as dicts for downstream consumers.

    Each result is ``{"body", "begin_index", "end_index"}``.  The whole
    extraction is rejected (empty list) when any keyword's span length
    disagrees with the keyword text or reaches 20 characters — both are
    signs of a bad match.
    """
    result_list = []
    for word, span in re_channel_103(text):
        if word is None:
            continue
        span_width = span[1] - span[0]
        # Sanity check: span must cover exactly the keyword, and long
        # "keywords" are almost certainly runaway matches.
        if span_width != len(word) or span_width >= 20:
            return []
        result_list.append({"body": word,
                            "begin_index": span[0],
                            "end_index": span[1]})
    return result_list
+
+
def test_csv(_path):
    """Batch-run re_channel_103 over the ``doctextcon`` column of a CSV,
    append the stringified predictions as a new column, and write the
    frame back to the same path.

    :param _path: CSV file with a ``doctextcon`` text column; the file
        is overwritten in place.
    """
    df = pd.read_csv(_path)

    predict_list = []
    for index, row in df.iterrows():
        # BUG FIX: re_channel_103() takes a single argument; the extra
        # "" second argument raised TypeError on every row.
        word_list = re_channel_103(row["doctextcon"])
        predict = word_list if word_list else []
        print("predict", predict)
        predict_list.append(str(predict))

    predict_df = pd.DataFrame(predict_list)
    df = pd.concat([df, predict_df], axis=1)

    df.to_csv(_path)
    print("finish write!")
+
+
def test_str():
    """Smoke-test extract_channel_103 on a sample announcement snippet.

    The snippet contains both a round marker (第1次澄清) and a bidder
    salutation (致各招标文件持有者:), so it should produce a match.
    """
    # BUG FIX: removed a dead first assignment to ``s`` that was
    # immediately overwritten and never used.
    s = '''
    (第1次澄清) 发布时间:2020-11-25 致各招标文件持有者: 招标人──舟山市
    '''
    print(extract_channel_103(s))
+
+
def test_html():
    """Smoke-test extract_channel_103 on a local HTML file.

    NOTE(review): the file is opened with the platform default encoding;
    confirm the fixture's encoding (likely GBK on the author's Windows
    box) before forcing one.
    """
    html_path = "C:/Users/Administrator/Desktop/3.html"

    with open(html_path, "r") as f:
        s = f.read()

    # BUG FIX: extract_channel_103() has no *title* parameter; passing
    # title="" raised TypeError.
    print(extract_channel_103(s))
+
+
if __name__ == "__main__":
    # Manual test entry point: uncomment exactly one of the calls below
    # to run the CSV batch test, the inline-string test, or the local
    # HTML test.
    path = "D:\\BIDI_DOC\\比地_文档\\澄清答疑_result.csv"
    # test_csv(path)
    test_str()
    # test_html(path)
    pass
+

+ 20 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -62,6 +62,15 @@ def link_entitys(list_entitys,on_value=0.8):
                         if len(_ent.entity_text)>len(_entity.entity_text):
                             _entity.entity_text = _ent.entity_text
 
+        # 2021/12/21 替换通过字典识别到的取长度最大的相似实体
+        for _entity in range_entity:
+            for _ent in _entity.linked_entitys:
+                print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
+                if re.search("公司$", _ent.entity_text) is not None \
+                        and _ent.if_dict_match == 1:
+                    if len(_ent.entity_text) > len(_entity.entity_text):
+                        _entity.entity_text = _ent.entity_text
+
 
 def getEnterprisePath():
     filename = "../LEGAL_ENTERPRISE.txt"
@@ -146,6 +155,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
         for p_sentence in list_sentence:
             sentence = p_sentence.sentence_text
             list_match = match_enterprise_max_first(sentence)
+            print("list_match", list_match)
 
             doc_id = p_sentence.doc_id
             sentence_index = p_sentence.sentence_index
@@ -164,10 +174,14 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                     if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
                         find_flag = True
                         p_entity.entity_type = "company"
+                        p_entity.if_dict_match = 1
 
                     if p_entity.entity_type not in ["location","org","company"]:
                         continue
 
+                    if _match["entity_text"] == p_entity.entity_text:
+                        p_entity.if_dict_match = 1
+
                     #有重叠
                     #match部分被包含则不处理
                     if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
@@ -189,6 +203,8 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             p_entity.wordOffset_end = _match["end_index"]
                             p_entity.begin_index = begin_index
                             p_entity.end_index = end_index
+                            # 该公司实体是字典识别的
+                            p_entity.if_dict_match = 1
 
                             for _match_h in range(_match_index+1,_match_j+1):
                                 entity_text = list_match[_match_h]["entity_text"]
@@ -198,6 +214,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                 end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                 add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
+                                add_entity.if_dict_match = 1
                                 list_entity.append(add_entity)
 
                                 range_entity.append(add_entity)
@@ -225,6 +242,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                     p_entity.wordOffset_end = _match["end_index"]
                                     p_entity.begin_index = begin_index
                                     p_entity.end_index = end_index
+                                    p_entity.if_dict_match = 1
                         elif _match["end_index"]>=p_entity.wordOffset_end:
                             match_replace = True
                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
@@ -236,6 +254,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             p_entity.begin_index = begin_index
                             p_entity.end_index = end_index
                             p_entity.entity_type = "company"
+                            p_entity.if_dict_match = 1
                     elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                         find_flag = True
                         if p_entity.entity_type in ("org","company"):
@@ -248,12 +267,12 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             p_entity.wordOffset_end = _match["end_index"]
                             p_entity.begin_index = begin_index
                             p_entity.end_index = end_index
+                            p_entity.if_dict_match = 1
                 if not find_flag:
                     match_add = True
                     entity_text = _match["entity_text"]
                     entity_type = "company"
 
-
                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                     end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)

+ 231 - 0
BiddingKG/dl/if_joint_bidding/re_if_joint_bidding.py

@@ -0,0 +1,231 @@
+import pandas as pd
+import re
+
+# 申请人可以组成联合体报名,联合体的家数最多不能超过两家
+# 本项目不接受供应商以联合体方式进行投标。
+
# Regex building blocks for deciding whether a notice accepts
# joint-venture (联合体) bidding.

# Bidding-activity noun reused inside the pattern bodies below.
bidway = '(参与|)(报价|投标|招标|竞价|报名|参加|资格预审|应答|谈判|磋商|竞标)(登记|)'

# --- Form 1: explicit question/answer lines ---------------------------------
# Examples that should match:
#   是否接收联合体投标: 不接受
#   联合体投标: 不允许
#   是否允许联合体投标登记:是
#   (是/否)接受联合体投标:否
# joint_bidding_prefix_1 = '(不[ ]?|[((]{0,1}[ ]?[是否不][ ]?[))]{0,1}|)'
joint_bidding_prefix_1 = "(是否|)"
bidway_1 = bidway
joint_bidding_body_1 = '(允许|接受|接收|)(联合体|独立体或联合体)' + bidway_1
joint_bidding_suffix_1 = '([ ::。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'

# --- Form 2: statement style, optional bracketed yes/no prefix --------------
# Examples that should match:
#   (不)接受联合体投标
#   本项目不接受联合体参与投标。
#   本合同包接受联合体投标
#   接受 联合体资格预审
#   接受独立体或联合体报名
joint_bidding_prefix_2 = '(不[ ]?|[((]{0,1}[ ]?[是否不][ ]?[))]{0,1}|)'
bidway_2 = "(" + bidway + "|)"
joint_bidding_body_2 = '(允许|接受|接收).?(联合体|独立体或联合体)' + bidway_2
joint_bidding_suffix_2 = '([ ::。]{0,2})(不接受|不接收|不允许|允许|接受|接收|是|否|)'
# joint_bidding_suffix_2 = ""

# --- Form 3: table-style rows, e.g. 是否允许联合体 不允许 --------------------
joint_bidding_prefix_3 = '(是否)'
joint_bidding_body_3 = '(允许|接受|接收).?(联合体|独立体或联合体)'
joint_bidding_suffix_3 = '([ ::。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'


# Phrases that merely mention joint bidding, or are blank yes/no form
# fields, and must be removed before matching.  Examples:
#   投标人须知前附表规定接受联合体投标的
#   (是/否)接受联合体投标: 是 否
#   联合体投标:接受;不接受
#   是否允许联合体: 1 是 0 否
#   允许联合体报名 □是 ■ 否
not_joint_bidding_1 = '(' \
                      '联合体投标的|如项目接受联合体投标' \
                      '|是否允许联合体: 1 是 0 否' \
                      '|联合体参加的|联合体牵头人|联合体牵头方|联合体成员|联合体(牵头人)' \
                      '|联合体各方|联合体协议' \
                      '|允许联合体报名 □是 ■ 否' \
                      ')'
# Blank "yes/no" or "accept/reject" option pairs (nothing ticked).
not_joint_bidding_2 = '(' \
                      '[((]{0,1}.?是.{1,2}否[))]{0,1}' \
                      '|[((]{0,1}.?接受.{0,2}不接受[))]{0,1}' \
                      '|1 是 0 否' \
                      '|.{1}接受.{1,2}不接受' \
                      ')'
+
+
def re_not_joint_bidding(_str):
    """Delete phrases that merely mention joint bidding (or are blank
    yes/no form fields) so they cannot produce false positives."""
    for noise_pattern in (not_joint_bidding_1, not_joint_bidding_2):
        _str = re.sub(noise_pattern, "", _str)
    return _str
+
+
def re_standard_joint_bidding(_str):
    """Match joint-bidding acceptance statements in *_str*.

    Three pattern forms are tried in priority order; the first form that
    yields any match determines the result.

    :return: list of ``[keyword, [begin, end]]`` items (empty when no
        form matches).

    BUG FIX: form 1 used to append a flat ``[keyword, begin, end]``
    triple while forms 2/3 appended ``[keyword, [begin, end]]``; the
    flat shape crashed judge_joint_bidding/extract_joint_bidding, which
    subscript the span.  All forms now emit the nested shape.
    """
    # (prefix, body, suffix, exclude_shifou): form 2 skips matches that
    # are immediately preceded by 是, i.e. the '是否...' question shape
    # already handled by forms 1 and 3.
    forms = [
        (joint_bidding_prefix_1, joint_bidding_body_1,
         joint_bidding_suffix_1, False),
        (joint_bidding_prefix_2, joint_bidding_body_2,
         joint_bidding_suffix_2, True),
        (joint_bidding_prefix_3, joint_bidding_body_3,
         joint_bidding_suffix_3, False),
    ]
    for prefix, body, suffix, exclude_shifou in forms:
        reg_standard = "(?P<prefix>" + prefix + ")" \
                       + "(?P<body>" + body + ")" \
                       + "(?P<suffix>" + suffix + ")"
        joint_bidding_list = []
        for m in re.finditer(reg_standard, _str):
            begin, end = m.span()
            if exclude_shifou and _str[begin - 1:begin] == "是":
                continue
            groups = m.groupdict()
            keyword = groups.get("prefix") + groups.get("body") \
                + groups.get("suffix")
            joint_bidding_list.append([keyword, [begin, end]])
        if joint_bidding_list:
            return joint_bidding_list
    return []
+
+
def re_joint_bidding(text):
    """Extract joint-bidding statements from *text*.

    Half-width parentheses are normalized to full-width first (the
    patterns only test full-width forms), confusable phrases are
    removed, then the staged standard patterns are applied.
    """
    # Normalize ASCII parentheses to their full-width equivalents.
    normalized = re.sub("\\(", "(", text)
    normalized = re.sub("\\)", ")", normalized)

    # Drop phrases that merely mention joint bidding.
    normalized = re_not_joint_bidding(normalized)

    return re_standard_joint_bidding(normalized)
+
+
def judge_joint_bidding(_list):
    """Label each match as rejected or accepted.

    Keywords containing 否 or 不 are prefixed with ``"0 "`` (joint
    bidding not allowed); all others with ``"1 "`` (allowed).  Spans are
    passed through unchanged.
    """
    labelled = []
    for keyword, span in _list:
        flag = "0" if ("否" in keyword or "不" in keyword) else "1"
        labelled.append([flag + " " + keyword, span])
    return labelled
+
+
def extract_joint_bidding(text):
    """Return labelled joint-bidding matches as dicts.

    Each result is ``{"body", "begin_index", "end_index"}`` where body
    carries the '0 '/'1 ' allowed-flag prefix added by
    judge_joint_bidding.
    """
    labelled = judge_joint_bidding(re_joint_bidding(text))
    result_list = []
    for word, span in labelled:
        if word is not None:
            result_list.append({"body": word,
                                "begin_index": span[0],
                                "end_index": span[1]})
    return result_list
+
+
def test_csv(_path):
    """Batch-run re_joint_bidding over the ``doctextcon`` column of a
    CSV, append the stringified predictions as a new column, and write
    the frame back to the same path (overwritten in place)."""
    df = pd.read_csv(_path)

    predictions = []
    for _index, row in df.iterrows():
        matches = re_joint_bidding(row["doctextcon"])
        predict = matches if matches else []
        print("predict", predict)
        predictions.append(str(predict))

    df = pd.concat([df, pd.DataFrame(predictions)], axis=1)

    df.to_csv(_path)
    print("finish write!")
+
+
def test_str():
    """Smoke-test extract_joint_bidding on a table-style sample row
    (form 3: 是否允许联合体 不允许)."""
    # BUG FIX: removed a dead first assignment to ``s`` that was
    # immediately overwritten and never used.
    s = '''
测绘服务 是否允许联合体 不允许 行业
    '''
    print(extract_joint_bidding(s))
+
+
def test_html(_path):
    """Smoke-test extract_joint_bidding on a local HTML file.

    :param _path: path of the HTML fixture to read.

    NOTE(review): opened with the platform default encoding; confirm
    the fixture's actual encoding before forcing one.
    """
    with open(_path, "r") as f:
        s = f.read()

    # BUG FIX: extract_joint_bidding() has no *title* parameter; passing
    # title="" raised TypeError.
    print(extract_joint_bidding(s))
+
+
if __name__ == "__main__":
    # Manual test entry point: uncomment exactly one of the calls below
    # to run the CSV batch test, the inline-string test, or the local
    # HTML test.
    path = "D:\\BIDI_DOC\\比地_文档\\投标工期_result.csv"
    test_csv(path)
    # test_str()
    # test_html(path)
    pass
+

+ 1 - 0
BiddingKG/dl/interface/Entitys.py

@@ -169,6 +169,7 @@ class Entity():
         self.is_tail = False
         self.notes = ''  # 2021/7/20 新增,保存金额大小写,单位等备注
         self.money_unit = '' #2021/8/17 新增,保存金额单位 元、万元 、亿元
+        self.if_dict_match = 0 # 2021/12/21 新增,判断公司实体是否由字典识别得到
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)

+ 45 - 10
BiddingKG/dl/interface/Preprocessing.py

@@ -110,6 +110,8 @@ def tableToText(soup):
         for tr in trs:
             tr_line = []
             tds = tr.findChildren(['td','th'], recursive=False)
+            if len(tds)==0:
+                tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
             for td in tds:
                 tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
                 #tr_line.append([td.get_text(),0])
@@ -660,9 +662,9 @@ def tableToText(soup):
                 for i in range(head_begin,head_end):
                     for w in range(len(inner_table[i])):
                         if inner_table[i][w][1]==1:
-                            _punctuation = ":"
+                            _punctuation = ""
                         else:
-                            _punctuation = ","
+                            _punctuation = ","  #2021/12/15 统一为中文标点,避免 206893924 国际F座1108,1,009,197.49元
                         if w>0:
                             if inner_table[i][w][0]!= inner_table[i][w-1][0]:
                                 text_line += inner_table[i][w][0]+_punctuation
@@ -994,15 +996,16 @@ def tableToText(soup):
     pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
 
     list_innerTable = []
-    tbodies = soup.find_all('tbody')
+
+    tbodies = soup.find_all('table')
     # 遍历表格中的每个tbody
     #逆序处理嵌套表格
     for tbody_index in range(1,len(tbodies)+1):
         tbody = tbodies[len(tbodies)-tbody_index]
         inner_table = trunTable(tbody)
         list_innerTable.append(inner_table)
-    '''2021/10/19先找tbody 再找table,避免一个table内多个tbody造成数据丢失'''
-    tbodies = soup.find_all('table')
+
+    tbodies = soup.find_all('tbody')
     # 遍历表格中的每个tbody
     #逆序处理嵌套表格
     for tbody_index in range(1,len(tbodies)+1):
@@ -1081,7 +1084,7 @@ def segment(soup,final=True):
     # 感叹号替换为中文句号
     text = re.sub("(?<=[\u4e00-\u9fa5])[!!]|[!!](?=[\u4e00-\u9fa5])","。",text)
     #替换"?"为 " " ,update:2021/7/20
-    text = re.sub("?"," ",text)
+    text = re.sub("?{1,}"," ",text)
 
 
     #替换"""为"“",否则导入deepdive出错
@@ -1124,7 +1127,7 @@ def segment(soup,final=True):
                 if ":" in punc_del.strip():
                     text = re.sub(punc_del,":",text)
                 else:
-                    text = re.sub(punc_del,punc_del.strip()[-1],text)
+                    text = re.sub(punc_del,punc_del.strip()[0],text)   #2021/12/09 修正由于某些标签后插入符号把原来符号替换
             else:
                 text = re.sub(punc_del,"",text)
         
@@ -1486,6 +1489,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         key_preprocess = "tableToText"
         start_time = time.time()
         article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
+        article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
+        article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
+        article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
 
         # 提取bidway
         list_bidway = extract_bidway(article_processed, _title)
@@ -1667,6 +1673,21 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             ner_entitys = ner_entitys_all[sentence_index]
 
+            '''正则识别角色实体  经营部|经销部|电脑部|服务部|复印部|印刷部|彩印部|装饰部|修理部|汽修部|修理店|零售店|设计店|服务店|家具店|专卖店|分店|文具行|商行|印刷厂|修理厂|维修中心|修配中心|养护中心|服务中心|会馆|文化馆|超市|门市|商场|家具城|印刷社|经销处'''
+            for it in re.finditer(
+                    '(?P<text_key_word>[^,。、;《]{,5}(单一来源|中标|中选|中价|成交)?(供应商|供货商|服务商|候选人|单位|人)(名称)?为?[::]+)(?P<text>([^,。、;《]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[,。]',
+                    sentence_text):
+                for k, v in it.groupdict().items():
+                    if k == 'text_key_word':
+                        keyword = v
+                    if k == 'text':
+                        entity = v
+                b = it.start() + len(keyword)
+                e = it.end() - 1
+                if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
+                    ner_entitys.append((b, e, 'company', entity))
+                    # print('正则新增 :',(b, e, 'company', entity))
+
 
             #识别package
 
@@ -1718,9 +1739,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台只吨斤棵株页亩方条]*))\s*[)\)]?))",
+                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台只吨斤棵株页亩方条]*))\s*[)\)]?))",
                                   "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?元)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)[\((]?(?P<unit_behind_m>[万亿]?元(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?元(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
 
             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
@@ -1879,13 +1900,27 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         notes = '总投资'
                     elif re.search('投资', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
                         notes = '投资'
+                    elif re.search('工程造价', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
+                        notes = '工程造价'
+                    elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
+                          or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
+                                       sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
+                          or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
+                                       sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
+                        notes = '保证金'
+                        # print('保证金信息:', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
+                    elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
+                                   sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
+                        notes = '成本警戒线'
                     elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]', sentence_text[_match.span()[0]:_match.span()[1]]):
                         cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
                         notes = cost_re.group(1)
-                    elif re.search('单价', sentence_text[_match.span()[0]:_match.span()[1]]):
+                    elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
                         notes = '单价'
                     elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
                         notes = '大写'
+                        if entity_text[0] == "拾":  # 2021/12/16 修正大写金额省略了数字转换错误问题
+                            entity_text = "壹"+entity_text
                         # print("补充备注:notes = 大写")
                     if len(unit)>0:
                         if unit.find('万')>=0 and len(entity_text.split('.')[0])>=8: # 2021/7/19 修正万元金额过大的情况

+ 76 - 29
BiddingKG/dl/interface/extract.py

@@ -4,7 +4,6 @@ Created on 2019年1月4日
 @author: User
 '''
 import os
-
 from bs4 import BeautifulSoup, Comment
 import copy
 import re
@@ -24,10 +23,11 @@ import BiddingKG.dl.interface.Preprocessing as Preprocessing
 import BiddingKG.dl.interface.getAttributes as getAttributes
 import BiddingKG.dl.complaint.punish_predictor as punish_rule
 import json
+from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
+from BiddingKG.dl.ratio.re_ratio import extract_ratio
 
 
-
-#自定义jsonEncoder
+# 自定义jsonEncoder
 class MyEncoder(json.JSONEncoder):
     def default(self, obj):
         if isinstance(obj, np.ndarray):
@@ -41,6 +41,7 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
+
 def predict(doc_id,text,title="",page_time="",**kwargs):
     cost_time = dict()
 
@@ -51,47 +52,49 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
 
-
-
-    #依赖句子顺序
-    start_time = time.time()
+    # 依赖句子顺序
+    start_time = time.time() # 公告类型/生命周期提取
     list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
     cost_time["channel"] = round(time.time()-start_time,2)
 
-    start_time = time.time()
+    start_time = time.time() # 项目编号、名称提取
     codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
     log("get codename done of doc_id%s"%(doc_id))
     cost_time["codename"] = round(time.time()-start_time,2)
 
-    start_time = time.time()
+    start_time = time.time() # 角色金额模型提取
     predictor.getPredictor("prem").predict(list_sentences,list_entitys)
     log("get prem done of doc_id%s"%(doc_id))
     cost_time["prem"] = round(time.time()-start_time,2)
 
-    start_time = time.time()
+    start_time = time.time() # 产品名称及废标原因提取
     predictor.getPredictor("product").predict(list_sentences,list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
 
-    start_time = time.time()
+    start_time = time.time() # 产品相关要素正则提取 单价、数量、品牌规格 ; 项目、需求、预算、时间
     product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
     log("get product attributes done of doc_id%s"%(doc_id))
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
-    # start_time = time.time()
-    # predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
-    # cost_time["rule"] = round(time.time()-start_time,2)
+    start_time = time.time() # 正则角色提取
+    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
+    cost_time["rule"] = round(time.time()-start_time,2)
 
-    start_time = time.time()
+    start_time = time.time() # 联系人模型提取
     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
     log("get epc done of doc_id%s"%(doc_id))
     cost_time["person"] = round(time.time()-start_time,2)
 
-    start_time = time.time()
+    start_time = time.time() # 时间类别提取
     predictor.getPredictor("time").predict(list_sentences, list_entitys)
     log("get time done of doc_id%s"%(doc_id))
     cost_time["time"] = round(time.time()-start_time,2)
 
+    start_time = time.time() # 保证金支付方式
+    payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
+    cost_time["deposit"] = round(time.time()-start_time,2)
+
     # 需在getPredictor("prem")后  getAttributes.getPREMs 前
     if len(re.findall('监理|施工|设计|勘察', title))==1 and re.search('施工|总承包|epc|EPC',title)==None:
         keyword = re.search('监理|设计|勘察', title).group(0)
@@ -106,14 +109,49 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
                         _entity.values[1] = 0.51
                         _entity.set_Money(1, _entity.values)
 
-    #依赖句子顺序
-    start_time = time.time()
+    # 2021-12-08新增:提取:总价,单价,比率
+    total_money_list = []
+    unit_money_list = []
+    ratio_list = []
+    for i in range(len(list_entitys)):
+        list_entity = list_entitys[i]
+
+        # 总价单价
+        for _entity in list_entity:
+            if _entity.entity_type == 'money':
+                word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text
+                # 总价在中投标金额中
+                if _entity.label == 1:
+                    result = extract_total_money(word_of_sentence,
+                                                 _entity.entity_text,
+                                                 [_entity.wordOffset_begin, _entity.wordOffset_end])
+                    if result:
+                        total_money_list.append(result)
+
+                # 单价在普通金额中
+                else:
+                    result = extract_unit_money(word_of_sentence,
+                                                _entity.entity_text,
+                                                [_entity.wordOffset_begin, _entity.wordOffset_end])
+                    if result:
+                        unit_money_list.append(result)
+
+        # 比率
+        all_sentence = ""
+        for sentence in list_sentences[i]:
+            all_sentence += sentence.sentence_text + ","
+        result = extract_ratio(all_sentence)
+        if result:
+            ratio_list.append(result)
+
+    # 依赖句子顺序
+    start_time = time.time() # 实体链接
     entityLink.link_entitys(list_entitys)
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
-    start_time = time.time()
+    start_time = time.time() # 失信数据要素提取
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = round(time.time()-start_time,2)
 
@@ -123,21 +161,25 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
                 if product in d['project_name']:
                     d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
 
-    #print(prem)
+    # print(prem)
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1])
+    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1], **payment_way_dic)
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 
-    for _article in list_articles:
-        log(_article.content)
+    data_res["total_money"] = total_money_list
+    data_res["unit_money"] = unit_money_list
+    data_res["ratio"] = ratio_list
 
-    for list_entity in list_entitys:
-        for _entity in list_entity:
-            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
-                  (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
-                   str(_entity.begin_index),str(_entity.end_index)))
+    # for _article in list_articles:
+    #     log(_article.content)
+    #
+    # for list_entity in list_entitys:
+    #     for _entity in list_entity:
+    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
+    #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
+    #                str(_entity.begin_index),str(_entity.end_index)))
 
     return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
 
@@ -169,9 +211,14 @@ if __name__=="__main__":
     #     print(rs['product_attrs'])
     # print(rs)
 
-    with open('D:/html/138786703.html', 'r', encoding='utf-8') as f:
+    with open('D:/html/2.html', 'r', encoding='utf-8') as f:
         text = f.read()
+        t1 = time.time()
+        print(predict('', text, title))
+        t2 = time.time()
         print(predict('', text, title))
+        t3 = time.time()
+        print('第一次耗时:%.4f, 第二次耗时:%.4f'%(t2-t1, t3-t2))
     # print(predict('',text,title))
 
     # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]

+ 225 - 32
BiddingKG/dl/interface/getAttributes.py

@@ -942,7 +942,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                     packageName_entity = "Project"
                                 if str(entity.label) in ["2","3","4"]:
                                     # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
-                                    if entity_after.notes == '单价':
+                                    if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
                                         addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
                                                          0.5)
                                         entity.pointer_money = entity_after
@@ -1129,10 +1129,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 if not re.search("电,?话", phone_left):
                     last_phone_mask = False
                     continue
-            if re.search("注册[证号]|帐,?号|编,?[号码]|报,?价|证,?号|价,?格|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", phone_left):
+            if re.search("注册[证号]|帐,?号|编,?[号码]|报,?价|标,?价|证,?号|价,?格|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", phone_left):
                 last_phone_mask = False
                 continue
-            if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+", phone_right):
+            if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
                 last_phone_mask = False
                 continue
             # if:上一个phone实体不符合条件
@@ -1989,7 +1989,30 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     while(p_entity>=0):
         entity = list_entity[p_entity]
         if entity.entity_type=="money":
-            if entity.values[entity.label]>=on_value:
+            # 2021/12/03 添加成本警戒线、保证金
+            if entity.notes in ['保证金', '成本警戒线']:
+                packagePointer, _flag = getPackage(PackageList, entity.sentence_index, entity.begin_index,
+                                                   "money-" + str(entity.label), MAX_DIS=2, DIRECT="L")
+                if packagePointer is None:
+                    packageName = "Project"
+                else:
+                    packageName = packagePointer.entity_text
+
+                if packageName == "Project":
+                    # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
+                    #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                    if entity.notes=="保证金" and "bond" not in PackDict["Project"]:
+                        PackDict["Project"]["bond"] = float(entity.entity_text)
+                    elif entity.notes=="成本警戒线" and "cost_warning" not in PackDict["Project"]:
+                        PackDict["Project"]["cost_warning"] = float(entity.entity_text)
+
+                else:
+                    if entity.notes == "保证金" and "bond" not in PackDict[packageName]:
+                        PackDict[packageName]["bond"] = float(entity.entity_text)
+                    elif entity.notes == "成本警戒线" and "cost_warning" not in PackDict[packageName]:
+                        PackDict[packageName]["cost_warning"] = float(entity.entity_text)
+
+            elif entity.values[entity.label]>=on_value:
                 if str(entity.label)=="1":
                     set_tenderer_money.add(float(entity.entity_text))
                     list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表,倒序保存所有中标金额
@@ -2172,21 +2195,191 @@ def turnBidWay(bidway):
     else:
         return "其他"
 
# Matches dates like "2021-12-03", "21/1/5", "2020年12月3日", "2020.1.2".
# Year may be 2 or 4 digits; separators may be -, /, ., or the CJK 年/月 markers.
my_time_format_pattern = re.compile(
    r"((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
import time


def my_timeFormat(_time):
    """Extract every date mention in *_time*, normalised to "YYYY-MM-DD".

    Two-digit years are assumed to mean 20xx. A candidate is discarded when
    its year is later than the current local year, or when its month/day is
    out of range. Fix over the original version: month and day of 0 are now
    rejected (the old checks were only ``month > 12`` / ``day > 31``, so
    "2020-0-12" leaked through as "2020-00-12"). Note that calendar validity
    beyond the 1..31 day bound (e.g. Feb 30) is still not checked, matching
    the original behaviour.

    :param _time: free text possibly containing one or more dates
    :return: list of "YYYY-MM-DD" strings in order of appearance (possibly empty)
    """
    current_year = int(time.strftime("%Y", time.localtime()))
    time_list = []
    for _match in re.finditer(my_time_format_pattern, _time):
        # All three groups are mandatory in the pattern, so they are never None.
        year = _match.group("year")
        month = _match.group("month")
        day = _match.group("day")
        if len(year) == 2:
            year = "20" + year
        if int(year) > current_year:        # future years are treated as noise
            continue
        if not (1 <= int(month) <= 12):     # rejects 0 and >12
            continue
        if not (1 <= int(day) <= 31):       # rejects 0 and >31
            continue
        time_list.append("%s-%s-%s" % (year, month.rjust(2, "0"), day.rjust(2, "0")))
    return time_list
+
def getTimeAttributes(list_entity, list_sentence):
    """Collect the best-scoring date for each time category from the entities.

    For every time entity (label != 0) whose class probability exceeds 0.5,
    the dates inside its text are extracted with ``my_timeFormat`` and routed
    to a category bucket according to the entity label; the highest-probability
    candidate per bucket wins.

    Refactored from a ~110-line if/elif ladder into a table-driven dispatch;
    routing behaviour is unchanged:
      * labels 1/2/3 map directly to release / bidopen / bidclose;
      * "range" labels carry a (start, end) pair — with two extracted dates the
        first is the start and the second the end; with a single date,
        direction-checked labels (12, 4, 6, 8, 10, 13) look at the surrounding
        text ("前/止/截止" on the right, "至/止" on the left, trailing "前") to
        decide start vs end, while end-only labels (5, 7, 9, 11, 14) always
        treat it as the end date.

    :param list_entity: entities of one document (only ``time`` labels used)
    :param list_sentence: the document's sentences; sorted by sentence_index
        here — NOTE(review): indexing with entity.sentence_index assumes the
        sorted position equals the index, confirm upstream guarantees this
    :return: dict mapping every time_* key to "YYYY-MM-DD" or ""
    """
    list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
    dict_time = {
        "time_release": [],
        "time_bidopen": [],
        "time_bidclose": [],
        'time_bidstart': [],  # 12 投标(开始)时间、响应文件接收(开始)时间

        'time_publicityStart': [],  # 4 公示开始时间(公示时间、公示期)
        'time_publicityEnd': [],  # 5 公示截止时间
        'time_getFileStart': [],  # 6 文件获取开始时间(文件获取时间)
        'time_getFileEnd': [],  # 7 文件获取截止时间
        'time_registrationStart': [],  # 8 报名开始时间(报名时间)
        'time_registrationEnd': [],  # 9 报名截止时间
        'time_earnestMoneyStart': [],  # 10 保证金递交开始时间(保证金递交时间)
        'time_earnestMoneyEnd': [],  # 11 保证金递交截止时间
        'time_commencement': [],  # 13 开工日期
        'time_completion': []  # 14 竣工日期
    }
    # label -> single target bucket
    single_label = {1: 'time_release', 2: 'time_bidopen', 3: 'time_bidclose'}
    # label -> (start bucket, end bucket, check context direction for a lone date?)
    range_label = {
        12: ('time_bidstart', 'time_bidclose', True),
        4: ('time_publicityStart', 'time_publicityEnd', True),
        5: ('time_publicityStart', 'time_publicityEnd', False),
        6: ('time_getFileStart', 'time_getFileEnd', True),
        7: ('time_getFileStart', 'time_getFileEnd', False),
        8: ('time_registrationStart', 'time_registrationEnd', True),
        9: ('time_registrationStart', 'time_registrationEnd', False),
        10: ('time_earnestMoneyStart', 'time_earnestMoneyEnd', True),
        11: ('time_earnestMoneyStart', 'time_earnestMoneyEnd', False),
        13: ('time_commencement', 'time_completion', True),
        14: ('time_commencement', 'time_completion', False),
    }
    for entity in list_entity:
        if entity.label == 0:
            continue
        extract_time = my_timeFormat(entity.entity_text)
        if not extract_time:
            continue
        label_prob = entity.values[entity.label]
        if label_prob <= 0.5:
            continue
        if entity.label in single_label:
            dict_time[single_label[entity.label]].append((extract_time[0], label_prob))
            continue
        if entity.label not in range_label:
            continue
        start_key, end_key, check_direction = range_label[entity.label]
        if len(extract_time) >= 2:
            # Two or more dates: first is the start, second the end.
            dict_time[start_key].append((extract_time[0], label_prob))
            dict_time[end_key].append((extract_time[1], label_prob))
        elif not check_direction:
            # End-only labels: a lone date is the deadline.
            dict_time[end_key].append((extract_time[0], label_prob))
        else:
            # A lone date: decide start vs end from the surrounding characters.
            sentence_text = list_sentence[entity.sentence_index].sentence_text
            entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
            entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
            is_end = (re.search("前|止|截止", entity_right)
                      or re.search("至|止", entity_left)
                      or re.search("前", entity.entity_text[-2:]))
            dict_time[end_key if is_end else start_key].append((extract_time[0], label_prob))

    # Per category keep the candidate with the highest probability ("" if none).
    result_dict = dict((key, "") for key in dict_time.keys())
    for time_type, list_time in dict_time.items():
        if list_time:
            list_time.sort(key=lambda x: x[1], reverse=True)
            result_dict[time_type] = list_time[0][0]
    return result_dict
+
 def getOtherAttributes(list_entity):
     dict_other = {"moneysource":"",
                   "person_review":[],
-                  "time_release":"",
-                  "time_bidopen":"",
-                  "time_bidclose":"",
+                  # "time_release":"",
+                  # "time_bidopen":"",
+                  # "time_bidclose":"",
                   "serviceTime":"",
                   "product":[],
                   "total_tendereeMoney":0,
                   "total_tendereeMoneyUnit":''}
-    dict_time = {
-        "time_release": [],
-        "time_bidopen": [],
-        "time_bidclose": []
-    }
+    # dict_time = {
+    #     "time_release": [],
+    #     "time_bidopen": [],
+    #     "time_bidclose": []
+    # }
     for entity in list_entity:
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
@@ -2194,31 +2387,31 @@ def getOtherAttributes(list_entity):
             dict_other["moneysource"] = entity.entity_text
         elif entity.entity_type=='serviceTime':
             dict_other["serviceTime"] = entity.entity_text
-        elif entity.entity_type == 'time' and entity.label==1:
-            if entity.values[entity.label]>0.6:
-                dict_time['time_release'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
-            # dict_other["time_release"] = timeFormat(entity.entity_text)
-        elif entity.entity_type == 'time' and entity.label==2:
-            if entity.values[entity.label]>0.6:
-                dict_time['time_bidopen'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
-            # dict_other["time_bidopen"] = timeFormat(entity.entity_text)
-        elif entity.entity_type == 'time' and entity.label == 3:
-            if entity.values[entity.label]>0.6:
-                dict_time['time_bidclose'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
-            # dict_other["time_bidclose"] = timeFormat(entity.entity_text)
+        # elif entity.entity_type == 'time' and entity.label==1:
+        #     if entity.values[entity.label]>0.6:
+        #         dict_time['time_release'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+        #     # dict_other["time_release"] = timeFormat(entity.entity_text)
+        # elif entity.entity_type == 'time' and entity.label==2:
+        #     if entity.values[entity.label]>0.6:
+        #         dict_time['time_bidopen'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+        #     # dict_other["time_bidopen"] = timeFormat(entity.entity_text)
+        # elif entity.entity_type == 'time' and entity.label == 3:
+        #     if entity.values[entity.label]>0.6:
+        #         dict_time['time_bidclose'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+        #     # dict_other["time_bidclose"] = timeFormat(entity.entity_text)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':
             dict_other["product"].append(entity.entity_text)
         elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
-            dict_other["total_tendereeMoney"] = float(entity.entity_text)
-            dict_other["total_tendereeMoneyUnit"] = entity.money_unit
+                dict_other["total_tendereeMoney"] = float(entity.entity_text)
+                dict_other["total_tendereeMoneyUnit"] = entity.money_unit
     # 时间类别
-    for time_type,value in dict_time.items():
-        list_time = dict_time[time_type]
-        if list_time:
-            list_time.sort(key=lambda x:x[1],reverse=True)
-            dict_other[time_type] = list_time[0][0]
+    # for time_type,value in dict_time.items():
+    #     list_time = dict_time[time_type]
+    #     if list_time:
+    #         list_time.sort(key=lambda x:x[1],reverse=True)
+    #         dict_other[time_type] = list_time[0][0]
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
 
@@ -2235,7 +2428,7 @@ def getPREMs(list_sentences,list_entitys,list_articles):
     result = []
     for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
         RoleList = getPackageRoleMoney(list_sentence,list_entity)
-        result.append(dict({"prem":RoleList,"docid":list_article.id},**getOtherAttributes(list_entity),
+        result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence),
                            **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
                               "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
                               "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))

+ 65 - 14
BiddingKG/dl/interface/predictor.py

@@ -35,7 +35,8 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   "punish":{"predictor":None,"Lock":RLock()},
                   "product":{"predictor":None,"Lock":RLock()},
                 "product_attrs":{"predictor":None,"Lock":RLock()},
-                  "channel": {"predictor": None, "Lock": RLock()}}
+                  "channel": {"predictor": None, "Lock": RLock()},
+                  "deposit_payment_way": {"predictor": None, "Lock": RLock()}}
 
 
 def getPredictor(_type):
@@ -62,6 +63,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
                 if _type == "channel":
                     dict_predictor[_type]["predictor"] = DocChannel()
+                if _type == 'deposit_payment_way':
+                    dict_predictor[_type]["predictor"] = DepositPaymentWay()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
@@ -542,6 +545,7 @@ class PREMPredict():
             list_entitys:文章的entitys
         @return:角色模型的输入数据
         '''
+        text_list = []
         data_x = []
         points_entitys = []
         for list_entity,list_sentence in zip(list_entitys,list_sentences):
@@ -556,6 +560,7 @@ class PREMPredict():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
+                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
                             #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
                             item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
                             data_x.append(item_x)
@@ -568,7 +573,7 @@ class PREMPredict():
         if len(points_entitys)==0:
             return None
         
-        return [data_x,points_entitys]
+        return [data_x,points_entitys, text_list]
     
     
     def search_money_data(self,list_sentences,list_entitys):
@@ -579,6 +584,7 @@ class PREMPredict():
             list_entitys:文章的entitys
         @return:金额模型的输入数据
         '''
+        text_list = []
         data_x = []
         points_entitys = []
         for list_entity,list_sentence in zip(list_entitys,list_sentences):
@@ -594,6 +600,7 @@ class PREMPredict():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
+                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
                             #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                             #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                             item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
@@ -606,7 +613,7 @@ class PREMPredict():
         if len(points_entitys)==0:
             return None
         
-        return [data_x,points_entitys]
+        return [data_x,points_entitys, text_list]
     
     def predict_role(self,list_sentences, list_entitys):
         datas = self.search_role_data(list_sentences, list_entitys)
@@ -614,6 +621,7 @@ class PREMPredict():
         if datas is None:
             return
         points_entitys = datas[1]
+        text_list = datas[2]
 
 
         if USE_PAI_EAS:
@@ -641,17 +649,24 @@ class PREMPredict():
         for i in range(len(predict_y)):
             entity = points_entitys[i]
             label = np.argmax(predict_y[i])
-            values = []
-            for item in predict_y[i]:
-                values.append(item)
-                entity.set_Role(label,values)
-        
+            values = predict_y[i]
+            text = text_list[i]
+            if label == 2:
+                if re.search('中标单位和.{,25}签订合同', text):
+                    label = 0
+                    values[label] = 0.501
+                elif re.search('尊敬的供应商:.{,25}我公司', text):
+                    label = 0
+                    values[label] = 0.801
+            entity.set_Role(label, values)
+
     def predict_money(self,list_sentences,list_entitys):
         datas = self.search_money_data(list_sentences, list_entitys)
         if datas is None:
             return
         points_entitys = datas[1]
         _data = datas[0]
+        text_list = datas[2]
         if USE_PAI_EAS:
             _data = np.transpose(np.array(_data),(1,0,2,3))
             request = tf_predict_pb2.PredictRequest()
@@ -677,7 +692,10 @@ class PREMPredict():
             entity = points_entitys[i]
             label = np.argmax(predict_y[i])
             values = predict_y[i]
-            if label ==0 and entity.notes=="投资":
+            text = text_list[i]
+            if label == 1 and re.search('[::,。](总金额|总价|单价)', text):
+                values[label] = 0.49
+            elif label ==0 and entity.notes in ["投资", "工程造价"]:
                 values[label] = 0.49
             entity.set_Money(label, values)
         
@@ -1065,17 +1083,17 @@ class FormPredictor():
 class RoleRulePredictor():
     
     def __init__(self):
-        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|转让|招租|甲|议标|合同主体|比选)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|业主名称|需方|询价单位)(是|为|信息|:|:|\s*)$)"
+        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|需方)(名称)?(是|为|信息|:|:|\s*)$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
         self.pattern_tenderee_right = "(?P<tenderee_right>^(\((以下简称)?[\"”]?(招标|采购)(人|单位|机构)\)?))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         
         self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|招标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{,20}委托))"
         self.pattern_agency_right = "(?P<agency_right>^(\((以下简称)?[\"”]?(代理)(人|单位|机构)\))|受.{,15}委托)"
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
-        self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|(选定单位|指定的中介服务机构))[::是为,]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[::是为]+$|((评审结果|名次|排名)[::]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|:|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[::]$)"
+        self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|各?供应商|方|公司|厂商|商)[::是为]+$|(选定单位|指定的中介服务机构))[::是为,]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[::是为]+$|((评审结果|名次|排名)[::]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|:|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[::]$)"
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
-        self.pattern_winTenderer_right = "(?P<winTenderer_right>^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))"
-        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
+        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
+        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)|中标通知书.{,15}你方"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
         # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
 
@@ -1193,7 +1211,7 @@ class RoleRulePredictor():
                                                     _role = _group.split("_")[0]
                                                     _direct = _group.split("_")[1]
                                                     _label = {"tenderee":0,"agency":1,"winTenderer":2,"secondTenderer":3,"thirdTenderer":4}.get(_role)
-                                                    if _i_span==0 and _direct=="left":
+                                                    if _i_span==0 and _direct=="left" and '各供应商' not in _v_group: #2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
                                                         list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
@@ -2186,6 +2204,39 @@ class DocChannel():
       # return self.id2type[id], prob
       return [{'docchannel':self.id2type[id]}]
 
# Extraction of the bid-deposit (保证金) payment method.
class DepositPaymentWay():
    """Extract how a bid deposit must be paid (bank transfer, draft, guarantee...).

    predict() returns {'deposit_patment_way': 'way1;way2'}.  The key's
    misspelling ("patment") is kept on purpose: downstream consumers read
    this exact key.
    """

    def __init__(self):
        # Pattern 1: "保证金…方式: <free text>" -- group 3 captures up to 60
        # characters of the description (stops at a comma / full stop).
        self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})'
        # Pattern 2: "保证金(必须)以/通过 <way> 方式" -- group 2 captures the way.
        self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
        kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
               '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
               '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
               '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
        # Longest keyword first so the alternation prefers the most specific
        # match (e.g. 银行转账 over 转账).
        self.kws = sorted(kws, key=lambda x: len(x), reverse=True)
        # Compile once: predict() is called per document.
        self.kws_re = re.compile('|'.join(self.kws))

    def predict(self, content):
        """Return {'deposit_patment_way': ';'-joined payment ways found in *content*}."""
        pay_way = {'deposit_patment_way': ''}
        # Try the descriptive pattern first, then the "以…方式" pattern.  As in
        # the original implementation, the first pattern that matches wins even
        # if no keyword is found inside its captured text.
        for pattern, group_idx in ((self.pt, 3), (self.pt2, 2)):
            matched = re.search(pattern, content)
            if matched:
                found = [kw.group(0) for kw in self.kws_re.finditer(matched.group(group_idx))]
                pay_way['deposit_patment_way'] = ';'.join(found)
                return pay_way
        return pay_way
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()

BIN
BiddingKG/dl/interface/timesplit_model/saved_model.pb


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.index


+ 133 - 0
BiddingKG/dl/money/re_money_total_unit.py

@@ -0,0 +1,133 @@
+import json
+import pandas as pd
+import re
+from bs4 import BeautifulSoup
+
# "Total price" label patterns, e.g. 合计金额 / 合 计 / 总价 (`.?` allows one
# arbitrary separator character between the key characters).
total_money = '(合计.?金额|合.?计|总.?价)'
# "Unit price" label patterns, e.g. 单价 or a money figure followed by '/'
# (as in 20(元)/平方).
unit_money = '(单价|([0-9.,,]+([((]?元[))]?)?/))'


def re_standard_total(_str):
    """Find total-price keywords in *_str*.

    Returns one [keyword, [begin, end], _str] triple per match; the full
    context string is carried along for downstream inspection.
    """
    # NOTE: re.finditer always returns an iterator (truthy even with zero
    # matches), so the old `if match:` guard was a no-op and was dropped.
    reg_standard = "(?P<value>" + total_money + ")"
    total_money_list = []
    for m in re.finditer(reg_standard, _str):
        begin, end = m.span()
        total_money_list.append([m.group("value"), [begin, end], _str])
    return total_money_list
+
+
def re_standard_unit(_str):
    """Find unit-price keywords in *_str*.

    Returns [keyword, [begin, end], _str] triples.  Contexts mentioning
    '文件' (bidding documents) are excluded entirely -- a price next to a
    document reference is usually a document fee, not a unit price.
    """
    # Hoisted out of the loop: the exclusion depends only on the context
    # string, not on the individual match.
    if '文件' in _str:
        return []
    # re.finditer is always truthy, so no `if match:` guard is needed.
    reg_standard = "(?P<value>" + unit_money + ")"
    unit_money_list = []
    for m in re.finditer(reg_standard, _str):
        begin, end = m.span()
        unit_money_list.append([m.group("value"), [begin, end], _str])
    return unit_money_list
+
+
def re_total(text, money, index):
    """Search the neighbourhood of an extracted money mention for total-price keywords.

    text  -- full document text
    money -- the extracted money string (unused; kept for interface parity
             with re_unit and the callers)
    index -- [begin, end] character span of the money mention in *text*
    """
    prefix_threshold = 10
    suffix_threshold = 10
    # Clamp the context window to the bounds of the text.
    window_start = max(index[0] - prefix_threshold, 0)
    window_end = min(index[1] + suffix_threshold, len(text))
    money_text = text[window_start:window_end]

    # Look for standard-form total-price keywords in the window.
    return re_standard_total(money_text)
+
+
def re_unit(text, money, index):
    """Search the neighbourhood of an extracted money mention for unit-price keywords.

    text  -- full document text
    money -- the extracted money string (unused; kept for interface parity)
    index -- [begin, end] character span of the money mention in *text*
    """
    prefix_threshold = 10
    suffix_threshold = 10
    # Clamp the context window to the bounds of the text.
    window_start = max(index[0] - prefix_threshold, 0)
    window_end = min(index[1] + suffix_threshold, len(text))
    money_text = text[window_start:window_end]

    # Look for standard-form unit-price keywords in the window.
    return re_standard_unit(money_text)
+
+
def extract_total_money(text, money, index):
    """Wrap re_total matches into result dicts carrying span and context."""
    matches = re_total(text, money, index)
    return [
        {"body": word, "begin_index": span[0],
         "end_index": span[1], "context": ctx}
        for word, span, ctx in matches
    ]
+
+
def extract_unit_money(text, money, index):
    """Wrap re_unit matches into result dicts carrying span and context."""
    matches = re_unit(text, money, index)
    return [
        {"body": word, "begin_index": span[0],
         "end_index": span[1], "context": ctx}
        for word, span, ctx in matches
    ]
+
+
def test_str():
    """Ad-hoc smoke test for unit-money extraction on a sample string."""
    # The previous first assignment to `s` was immediately overwritten (dead
    # code) and has been removed.
    s = '往往,20(元)/平方'
    print(extract_unit_money(s, "785.0", [6, 11]))
+
+
def test_html():
    """Ad-hoc test: run total-money extraction over a local HTML file."""
    html_path = "C:/Users/Administrator/Desktop/3.html"

    with open(html_path, "r") as f:
        s = f.read()

    # BUG FIX: extract_total_money() takes (text, money, index); the old
    # single-argument call always raised TypeError.  Scan the whole text by
    # passing a span covering the entire document.
    print(extract_total_money(s, "", [0, len(s)]))
+
+
if __name__ == "__main__":
    # Quick manual run; `path` points at the CSV used by the batch tests
    # (test_html / test_csv style runs).
    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
    test_str()
+

+ 75 - 0
BiddingKG/dl/money/test_re_money_total_unit.py

@@ -0,0 +1,75 @@
+import json
+import re
+import sys, os
+import time
+
+import pandas as pd
+from bs4 import BeautifulSoup
+sys.path.append(os.path.abspath("../.."))
+from BiddingKG.dl.interface.extract import predict
+
+
def bidi_predict(html_str):
    """Run the full BiddingKG extraction pipeline and parse its JSON result."""
    return json.loads(predict("1", html_str))
+
+
def test_csv(_path):
    """Run extraction over every row of a labelled CSV and append the
    total-money / unit-money prediction columns.

    Note: the result is written back to *_path*, overwriting the input file.
    """
    start_time = time.time()
    df = pd.read_csv(_path)

    predict_list_1 = []  # stringified total_money result per row
    predict_list_2 = []  # stringified unit_money result per row
    for index, row in df.iterrows():
        if index % 50 == 0:
            print("=" * 30, "Loop", index, "=" * 30)

        html_str = row["dochtmlcon"]

        # Run the full model pipeline; the regex extraction happens inside it.
        result_dict = bidi_predict(html_str)

        # BUG FIX: the result locals used to be named `predict`, shadowing the
        # imported BiddingKG predict() function; renamed for clarity.
        total_result = result_dict.get("total_money") or []
        print("predict total money", total_result)
        predict_list_1.append(str(total_result))

        unit_result = result_dict.get("unit_money") or []
        print("predict unit money", unit_result)
        predict_list_2.append(str(unit_result))

    predict_df_1 = pd.DataFrame(predict_list_1)
    predict_df_2 = pd.DataFrame(predict_list_2)
    df = pd.concat([df, predict_df_1, predict_df_2], axis=1)
    df.to_csv(_path)
    print("finish write!", time.time() - start_time)
+
+
if __name__ == "__main__":
    # Batch-evaluate total/unit money extraction over the labelled CSV.
    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
    test_csv(path)

+ 28 - 0
BiddingKG/dl/offer_type/re_offer_type.py

@@ -0,0 +1,28 @@
+import pandas as pd
+import re
+
+# 报价类型为总价报价
+# 报价类型: 闭口价
+# 报价类型:国内含税价/人民币
+# 报价类型:国内含税价;人民币
+# 报价类型: 浮动价
+# 报价类型 含税含运费
+# 报价类型 单个商品报价
+# 报价类型:单个标的报单价
+# 报价类型:多个标的报总价,
+# 报价类型:不含税(到厂)
+# 报价类型: 金额
+# 报价类型 含税含运费
+# 报价类型:单个标的报单价
+
+
+
+
+
+
+
+
+
+# 报价类型:
+
+

+ 68 - 0
BiddingKG/dl/ratio/re_ratio.py

@@ -0,0 +1,68 @@
+import re
+
# Up-/down-float-rate pattern (上浮率/下浮率 followed by a number, optionally
# wrapped in parentheses and/or followed by a percent sign).
# BUG FIX: the decimal point between the digit groups is now escaped (\.);
# previously a bare '.' matched ANY single character (e.g. "30a5" matched).
ratio = r'([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+\.?[0-9]*[((]?%?[))]?)'

# Examples this should match:
# 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,
# 基准利率上浮率:百分之三十(30%)
# 租金上浮率
# 上浮率活期20%
# 上浮率:活期20%、一年定期35%
# 下浮率报价0.5%


def re_standard_ratio(_str):
    """Find standard-form float-rate expressions in *_str*.

    Returns a list of [keyword, [begin, end]] pairs, one per match.
    """
    # re.finditer is always truthy, so no `if match:` guard is needed.
    reg_standard = "(?P<value>" + ratio + ")"
    ratio_list = []
    for m in re.finditer(reg_standard, _str):
        begin, end = m.span()
        ratio_list.append([m.group("value"), [begin, end]])
    return ratio_list
+
+
def re_ratio(text):
    """Find standard-form ratios (上浮率/下浮率) in *text*.

    The old inline comment said "总价" (total price) -- a copy-paste slip from
    the money extractor; this function extracts ratios.
    """
    return re_standard_ratio(text)
+
+
def extract_ratio(text):
    """Wrap re_ratio matches into result dicts with character offsets.

    The old local was named `total_money_list` -- another copy-paste from the
    money extractor; renamed to describe what it actually holds.
    """
    ratio_list = re_ratio(text)
    return [
        {"body": word, "begin_index": span[0], "end_index": span[1]}
        for word, span in ratio_list
    ]
+
+
def test_str():
    """Ad-hoc smoke test for ratio extraction on a sample string."""
    # The previous first assignment to `s` was immediately overwritten (dead
    # code) and has been removed.
    s = '年利率较基准利率的上浮率(%): 30 活期存款下浮率:0.455% 协定存的下浮率,(1-下浮率)' \
        ' 上浮率....  上浮率30(%)  (下浮率%):43  下浮率报价0.5%'
    print(extract_ratio(s))
+
+
def test_html():
    """Ad-hoc test: run ratio extraction over a local HTML file."""
    html_path = "C:/Users/Administrator/Desktop/3.html"

    with open(html_path, "r") as f:
        content = f.read()

    print(extract_ratio(content))
+
+
if __name__ == "__main__":
    # Quick manual run; the CSV-based evaluation lives in test_re_ratio.py.
    test_str()
+

+ 62 - 0
BiddingKG/dl/ratio/test_re_ratio.py

@@ -0,0 +1,62 @@
+import json
+import sys, os
+import time
+import pandas as pd
+sys.path.append(os.path.abspath("../../.."))
+print("sys.path[-1]", sys.path[-1])
+from BiddingKG.dl.interface.extract import predict
+
+
def bidi_predict(html_str):
    """Run the BiddingKG extraction pipeline on *html_str* and return a dict."""
    return json.loads(predict("1", html_str))
+
+
def test_csv(_path):
    """Evaluate ratio / total-money / unit-money extraction over a CSV.

    Reads *_path*, runs the full pipeline for each row, then appends the
    three prediction columns (ratio, total, unit -- in that order) and writes
    the result back to the same file.
    """
    started = time.time()
    df = pd.read_csv(_path)

    totals = []  # stringified total_money per row
    units = []   # stringified unit_money per row
    ratios = []  # stringified ratio per row
    for row_idx, row in df.iterrows():
        if row_idx % 50 == 0:
            print("=" * 30, "Loop", row_idx, time.time() - started, "=" * 30)

        # Full model pipeline per document.
        result_dict = bidi_predict(row["dochtmlcon"])

        ratios.append(str(result_dict.get("ratio")))
        totals.append(str(result_dict.get("total_money")))
        units.append(str(result_dict.get("unit_money")))

    ratio_df = pd.DataFrame(ratios)
    total_df = pd.DataFrame(totals)
    unit_df = pd.DataFrame(units)
    df = pd.concat([df, ratio_df, total_df, unit_df], axis=1)
    df.to_csv(_path)
    print("finish write!", time.time() - started)
+
+
if __name__ == "__main__":
    # Alternative datasets kept for reference:
    #   "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
    #   '总价单价_result.csv'
    path = '比率_result.csv'
    test_csv(path)

+ 4 - 2
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -147,6 +147,8 @@ def predict(doc_id,text):
                     # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                     pass
                 # print(entity.pointer_pack)
+            # elif entity.entity_type =='serviceTime':
+            #     print(entity.entity_text)
             #     if entity.pointer_pack:
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
             # elif entity.entity_type in ['package']:
@@ -439,8 +441,8 @@ if __name__=="__main__":
     a = time.time()
     print("start")
     # print(predict("12",content))
-    # result = predict("12",text)
-    result = predict("12",content)
+    result = predict("12",text)
+    # result = predict("12",content)
     # print(json.loads(result))
     #test("12",text)
     print("takes",time.time()-a)

BIN
BiddingKG/dl/time/model_time_classify.weights


Diff do ficheiro suprimidas por serem muito extensas
+ 1 - 7
BiddingKG/dl/time/re_servicetime.py


+ 367 - 16
BiddingKG/dl/time/train_2.py

@@ -13,10 +13,32 @@ from BiddingKG.dl.common.models import *
 from sklearn.metrics import classification_report
 from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt
+import random
 
 input_shape = (2,30,60)
 input_shape2 = (2,40,128)
-output_shape = [4]
+# output_shape = [4]
+
# Mapping from time-entity class name to model label index.  The indices are
# not contiguous with the listing order because classes 12-14 were added
# after the publicity/registration/deposit classes.
time_label_dict = {
    'time': 0,
    'time_release': 1,             # announcement release time
    'time_bidopen': 2,             # bid-opening time
    'time_bidclose': 3,            # bid-closing time
    'time_bidstart': 12,           # bidding / response-reception start time
    'time_publicityStart': 4,      # publicity start (publicity period)
    'time_publicityEnd': 5,        # publicity end
    'time_getFileStart': 6,        # document-obtaining start
    'time_getFileEnd': 7,          # document-obtaining deadline
    'time_registrationStart': 8,   # registration start
    'time_registrationEnd': 9,     # registration deadline
    'time_earnestMoneyStart': 10,  # deposit submission start
    'time_earnestMoneyEnd': 11,    # deposit submission deadline
    'time_commencement': 13,       # commencement date
    'time_completion': 14          # completion date
}
# The classifier's output width tracks the number of classes automatically.
output_shape = [len(time_label_dict)]
+
 
 def get_data():
     data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
@@ -91,16 +113,23 @@ def getModel2():
     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
 
-    L_input_drop = Dropout(0.2)(L_input)
-    R_input_drop = Dropout(0.2)(R_input)
+    L_input_drop = Dropout(0.3)(L_input)
+    R_input_drop = Dropout(0.3)(R_input)
     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
     L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
     # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
     R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
     R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    L_R = layers.merge([L_lstm, R_lstm],concat_axis=1, mode='concat')
+    L_R_mask = layers.merge([L_mask, R_mask],concat_axis=1, mode='concat')
+    L_R_att = Attention02()(L_R,mask=K.squeeze(L_R_mask,axis=-1))
+
+    L_att = layers.add([L_att,L_R_att])
+    R_att = layers.add([R_att,L_R_att])
     concat = layers.merge([L_att, R_att], mode='concat')
-    concat = Dropout(0.3)(concat)
+
+    concat = Dropout(0.2)(concat)
     output = layers.Dense(output_shape[0],activation="softmax")(concat)
 
     model = models.Model(inputs=[L_input,R_input], outputs=output)
@@ -111,6 +140,36 @@ def getModel2():
                   metrics=[precision,recall,f1_score])
     model.summary()
     return model
+# def getModel2():
+#     '''
+#     @summary: 时间分类模型
+#     '''
+#     L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+#     L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+#     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+#     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+#
+#     L_input_drop = Dropout(0.3)(L_input)
+#     R_input_drop = Dropout(0.3)(R_input)
+#     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+#     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+#     L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+#     # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+#     R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+#     R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+#     concat = layers.merge([L_att, R_att], mode='concat')
+#
+#     concat = Dropout(0.2)(concat)
+#     output = layers.Dense(output_shape[0],activation="softmax")(concat)
+#
+#     model = models.Model(inputs=[L_input,R_input], outputs=output)
+#
+#     learn_rate = 0.00005
+#     model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+#                   loss=losses.binary_crossentropy,
+#                   metrics=[precision,recall,f1_score])
+#     model.summary()
+#     return model
 
 def getModel3():
     '''
@@ -121,8 +180,8 @@ def getModel3():
     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
 
-    L_input_drop = Dropout(0.2)(L_input)
-    R_input_drop = Dropout(0.2)(R_input)
+    L_input_drop = Dropout(0.3)(L_input)
+    R_input_drop = Dropout(0.3)(R_input)
     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
     # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
@@ -133,7 +192,7 @@ def getModel3():
     att = Attention02()(concat,mask=K.squeeze(concat_mask,axis=-1))
     # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
     # concat = layers.merge([L_att, R_att], mode='concat')
-    att = Dropout(0.3)(att)
+    att = Dropout(0.2)(att)
     output = layers.Dense(output_shape[0],activation="softmax")(att)
 
     model = models.Model(inputs=[L_input,R_input], outputs=output)
@@ -145,6 +204,72 @@ def getModel3():
     model.summary()
     return model
 
class Attention(Layer):
    """Multi-head scaled dot-product attention as a Keras custom layer.

    Inputs: [q, k, v] plus optional value-side and query-side masks.
    Output shape: (batch, q_len, nb_head * size_per_head).
    """

    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.out_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(Attention, self).build(input_shape)
        q_in_dim = input_shape[0][-1]
        k_in_dim = input_shape[1][-1]
        v_in_dim = input_shape[2][-1]
        self.q_kernel = self.add_weight(name='q_kernel',
                                        shape=(q_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.k_kernel = self.add_weight(name='k_kernel',
                                        shape=(k_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        # NOTE(review): the value kernel is registered under the weight name
        # 'w_kernel' (not 'v_kernel').  Kept as-is so previously saved weights
        # still load by name.
        self.v_kernel = self.add_weight(name='w_kernel',
                                        shape=(v_in_dim, self.out_dim),
                                        initializer='glorot_normal')

    def mask(self, x, mask, mode='mul'):
        """Apply *mask* to *x*: multiplicative zeroing or additive -1e10."""
        if mask is None:
            return x
        for _ in range(K.ndim(x) - K.ndim(mask)):
            mask = K.expand_dims(mask, K.ndim(mask))
        if mode == 'mul':
            return x * mask
        return x - (1 - mask) * 1e10

    def call(self, inputs):
        q, k, v = inputs[:3]
        v_mask, q_mask = None, None
        if len(inputs) > 3:
            v_mask = inputs[3]
            if len(inputs) > 4:
                q_mask = inputs[4]
        # Linear projections.
        qw = K.dot(q, self.q_kernel)
        kw = K.dot(k, self.k_kernel)
        vw = K.dot(v, self.v_kernel)
        # Split into heads: (batch, seq, heads, size_per_head).
        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
        # Move the head axis forward: (batch, heads, seq, size_per_head).
        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
        # Scaled dot-product scores with additive masking on the value side.
        a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head**0.5
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = self.mask(a, v_mask, 'add')
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = K.softmax(a)
        # Weighted sum of values; merge heads back; mask on the query side.
        o = K.batch_dot(a, vw, [3, 2])
        o = K.permute_dimensions(o, (0, 2, 1, 3))
        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
        return self.mask(o, q_mask, 'mul')

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.out_dim)
+
 class Attention02(Layer):
     def __init__(self, **kwargs):
         self.init = initializers.get('normal')
@@ -530,11 +655,216 @@ def train3():
     # # y_pre2 = load_model.predict(train_x[0])
     # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     # print(res2)
+
def _time_sample(left, right, label):
    """Build one training sample: ([left_tokens, right_tokens], one_hot_label).

    *left*/*right* are stringified token lists from the labelling
    spreadsheets; they are eval()'d -- acceptable only because these files are
    produced locally by the labelling pipeline (flagged: never run this on
    untrusted data) -- and truncated to the last/first 40 tokens respectively.
    """
    y = np.zeros(output_shape)
    y[label] = 1
    context = [eval(left)[-40:], eval(right)[:40]]
    return context, y


def train4():
    """Train the time-classification model on the combined labelled datasets."""
    # Labelled context datasets (locally produced spreadsheets).
    data_load = pd.read_excel("tokens_tolabel_data1_res13New.xlsx", index_col=0)
    data_zero = pd.read_excel("time_entity5.xlsx")
    # Keep only rows that were manually reviewed or double-confirmed.
    data_zero = data_zero[(data_zero['viewed'] == 1) | (data_zero['is_same'] == 2)]
    data_old = pd.read_excel("tokens_data_02_res7New.xlsx")
    data_delay1 = pd.read_excel("delayTime_entity1.xlsx")
    data_delay1 = data_delay1[data_delay1['label'] != 0]
    data_delay2 = pd.read_excel("delayTime_entity2.xlsx")

    print("输入shape:", input_shape2)
    data_x = []
    data_y = []

    # The four datasets below used four copy-pasted loops with identical
    # sample-building logic; only the label column differs, so they are now
    # driven through one helper.
    for lefts, rights, labels in (
            (data_load['context_left'], data_load['context_right'], data_load['re_label']),
            (data_zero['context_left'], data_zero['context_right'], data_zero['re_label']),
            (data_delay1['context_left'], data_delay1['context_right'], data_delay1['label']),
            (data_delay2['context_left'], data_delay2['context_right'], data_delay2['re_label'])):
        for left, right, label in zip(lefts, rights, labels):
            context, y = _time_sample(left, right, label)
            data_x.append(context)
            data_y.append(y)

    # data_old needs relabelling: keep label-0 rows only when the previous
    # model agreed (is_same == 1), or when the predicted label is one of the
    # newer classes (> 3), in which case the prediction replaces the label.
    for left, right, label, pre_label, is_same in zip(
            data_old['context_left'], data_old['context_right'],
            data_old['label'], data_old['pre_label'], data_old['is_same']):
        if label == 0:
            if is_same != 1:
                if pre_label > 3:
                    label = pre_label
                else:
                    continue
        context, y = _time_sample(left, right, label)
        data_x.append(context)
        data_y.append(y)

    # Shuffle, then split off 11% as the validation set.
    _data = list(zip(data_x, data_y))
    random.shuffle(_data)
    data_x = [pair[0] for pair in _data]
    data_y = [pair[1] for pair in _data]
    test_len = int(len(data_x) * 0.11)
    test_x, test_y = data_x[:test_len], data_y[:test_len]
    print("测试数据量:", len(test_x))
    train_x, train_y = data_x[test_len:], data_y[test_len:]
    print("训练数据量:", len(train_x))

    training_generator = DataGenerator(train_x, train_y, is_train=True)
    validation_generator = DataGenerator(test_x, test_y, is_train=False, shuffle=False)

    model = getModel2()
    epochs = 100
    # Save only the best weights (lowest validation loss).
    checkpoint = ModelCheckpoint("model_time_classify.weights", save_weights_only=True,
                                 monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')

    model.fit_generator(
        generator=training_generator,
        validation_data=validation_generator,
        use_multiprocessing=True, workers=2,
        epochs=epochs,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
+
 from keras.utils import Sequence,to_categorical
 class DataGenerator(Sequence):
     'Generates data for Keras'
-    def __init__(self, texts, labels, batch_size=256,
-                 n_classes=4, shuffle=True):
+    def __init__(self, texts, labels, is_train=True,batch_size=256,
+                 n_classes=len(time_label_dict), shuffle=True):
         'Initialization'
         # self.dim = dim
         self.batch_size = batch_size
@@ -542,6 +872,7 @@ class DataGenerator(Sequence):
         self.texts = texts
         self.n_classes = n_classes
         self.shuffle = shuffle
+        self.is_train = is_train
         self.on_epoch_end()
 
     def __len__(self):
@@ -583,8 +914,22 @@ class DataGenerator(Sequence):
         # Generate data
         for i, context in enumerate(list_texts):
             # Store sample
-            # tokens = preprocess2(text)
-            # tokens = tokens[:maxlen]
+            if self.is_train:
+                left = context[0]
+                if len(left) > 30:
+                    if random.random() > 0.5:
+                        left = left[2:]
+                elif len(left) > 15:
+                    if random.random() > 0.5:
+                        left = left[1:]
+                right = context[1]
+                if len(right) > 30:
+                    if random.random() > 0.5:
+                        right = right[:-2]
+                elif len(right) > 15:
+                    if random.random() > 0.5:
+                        right = right[:-1]
+                context = [left, right]
             words_matrix = embedding_mywords(context, shape=input_shape2)
             # Store class
             # y[i] = _label[i]
@@ -647,7 +992,11 @@ def predict3():
     new_data.to_excel("new_tokens_data1_res.xlsx")
 
 def predict4():
-    data = pd.read_csv("tokens_tolabel_data1_res11.csv", chunksize=3000)
+    data = pd.read_csv("tokens_data_02_res6New.csv", chunksize=3000)
+    # data = pd.read_excel("C:\\Users\\Administrator\\Desktop\\time_entity4.xlsx")
+    # data.to_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv")
+    # data = pd.read_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv", chunksize=3000)
+
     model1 = getModel2()
     model1.load_weights("model_time_classify.weights")
     new_data = pd.DataFrame()
@@ -671,14 +1020,15 @@ def predict4():
         pre_y = model1.predict([test_x[0], test_x[1]])
         _data['pre_label'] = [np.argmax(item) for item in pre_y]
         _data['pre_label_prob'] = [max(item) for item in pre_y]
-        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['re_label'],_data['pre_label'])]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['label'],_data['pre_label'])]
         # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
         # data['label'] = label
         new_data = pd.concat([new_data, _data])
         idx += 3000
         print(idx)
-    # data.to_csv("new_tokens_data1.csv")
-    new_data.to_excel("tokens_tolabel_data1_res12.xlsx")
+    # new_data.to_csv("tokens_data_02_res7New.csv")
+    new_data.to_excel("tokens_data_02_res7New.xlsx")
+    # new_data.to_excel("C:\\Users\\Administrator\\Desktop\\tokens_data_02_res7New.xlsx")
 
 
 def predict():
@@ -863,7 +1213,7 @@ def save_model():
             test_model = getModel2()
             test_model.load_weights("model_time_classify.weights")
             tf.saved_model.simple_save(sess,
-                                       "models/timesplit_model/",
+                                       "models/timesplit_model2/",
                                        inputs={"input0": test_model.input[0],
                                                "input1":test_model.input[1]
                                                },
@@ -879,6 +1229,7 @@ if __name__ == '__main__':
     # training()
     # train2()
     # train3()
+    # train4()
     # data_process()
     # data_process2()
     # data_process3()

Alguns ficheiros não foram mostrados porque muitos ficheiros mudaram neste diff