3 years ago · 67c7f4f0b1
--- a/BiddingKG/dl/channel/re_channel_103.py
+++ b/BiddingKG/dl/channel/re_channel_103.py
@@ -0,0 +1,219 @@
 
															+import pandas as pd
														
 
															+import re
														
 
															+
														
 
															+# 各投标人
														
 
															+# 各潜在投标人
														
 
															+# 各潜在投标人：
														
 
															+# 致各招标文件持有者：
														
 
															+# 致各投标人
														
 
															+# 各潜在投标供应商：
														
 
															+
														
 
															+# 修改、澄清(答疑)纪要内容如下: 1、
														
 
															+# 答疑澄清与修改的主要内容：
														
 
															+# 对文件澄清与修改的主要内容
														
 
															+# 澄清、修改内容要点
														
 
															+# 答疑纪要
														
 
															+# 答疑如下
														
 
															+# 招标文件答疑和招标文件修改通知
														
 
															+# 招标文件答疑通知
														
 
															+# 答疑及补遗通知
														
 
															+# 答疑回复如下：
														
 
															+# 现对投标人提出的质疑回复如下：
														
 
															+# 对文件澄清与修改的主要内容 详见招标文件
														
 
															+# 修改的主要内容 详见附件
														
 
															+# 澄清或修改事项：
														
 
															+
														
 
															+# 第1次答疑
														
 
															+# 第1次答疑澄清
														
 
															+
														
 
															+# 答疑补遗文件
														
 
															+# 补遗书澄清文件 答疑澄清
														
 
															+# 质疑1
														
 
															+# 问题
														
 
															+# 答疑文件1
														
 
															+# 具体补遗内容详见附件
														
 
															+# 请问 答
														
 
															+# 问题 回复
														
 
															+# 答疑澄清公告 1：
														
 
															+# 现对招标文件作如下澄清：
														
 
															+# 详见答疑澄清文件
														
 
															+# 详见答疑文件。
														
 
															+
														
 
															+
														
 
# Gate pattern: re_standard_channel_103 returns no matches unless one of these
# clarification / Q&A / addendum / amendment words occurs in the text.
channel_103 = '(澄清|答疑|补遗|修改)'
# Stage 0: salutation addressed to bidders, e.g. "致各招标文件持有者：".
channel_103_0 = '(致|至|)(各|各个)(潜在|)(投标|招标|招标文件持有|报价|竞选|)(人|者|供应商|单位)(:|：)'
# Stage 1: a clarification-type word, then one or two content/reply heading
# words, then a lead-in such as "如下", a colon, "详见" or a first list marker.
channel_103_1 = '(澄清|答疑|补遗|修改|质疑)(.?)(具体内容|主要内容|内容|回复|发布|纪要|事项|如下){1,2}(.?)' \
                '(如下|[：:]|详见|点击下载附件|[1一][:：、]|（1）|\\(1\\)|一)'
# Stage 2: "第N次答疑" / "第N次澄清", where N is at most one arbitrary character.
channel_103_2 = '第(.?)次(答疑|澄清)'
# Stage 3: a clarification-type word directly followed by "公告" or "文件".
channel_103_3 = '(澄清|答疑|补遗|修改)(公告|文件)'
# Context pattern searched in the 50 characters after a stage-2 or stage-3 match.
channel_103_after = '(请问|提问|问题|答复|回复|质疑|答|问){1,2}[12一]?[:：]|[一1][:：、]|（1）|\\(1\\)|(详见|见)(附件|答疑文件|澄清文件|答疑澄清文件)'
# Stage 4: a question/answer label, optionally numbered, ending in a colon.
channel_103_4 = '(补充答疑|提疑内容|请问|提问|问题|回复|答复|答疑|质疑|答|问)[12一]?[:：]'
# Stage 5: "see the clarification/Q&A document" or "see attachment for the addendum".
channel_103_5 = '(见|详见)(答疑澄清文件|澄清文件|答疑文件)|补遗内容详见附件'
														
 
															+
														
 
															+# 答疑澄清时间
														
 
															+# 对文件澄清与修改的主要内容 无澄清文件
														
 
															+# 对文件澄清与修改的主要内容 无
														
 
															+# 请各投标单位自行下载
														
 
# Confusable phrases; re_not_channel_103 masks every occurrence of these
# before the channel_103 patterns are applied.
not_channel_103 = '答疑澄清时间|主要内容.?无|请各投标单位'
														
 
															+
														
 
															+
														
 
															+def re_standard_channel_103(_str):
														
 
															+    channel_103_list = []
														
 
															+
														
 
															+    if not re.search(channel_103, _str):
														
 
															+        print("not")
														
 
															+        return channel_103_list
														
 
															+
														
 
															+    reg_standard = "(?P<value>" + channel_103_0 + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    for m in match:
														
 
															+        m_dict = m.groupdict()
														
 
															+        m_span = m.span()
														
 
															+        keyword_index = [m_span[0], m_span[1]]
														
 
															+        keyword = m_dict.get('value')
														
 
															+        channel_103_list.append([keyword, keyword_index])
														
 
															+    if channel_103_list:
														
 
															+        print("0", channel_103_list)
														
 
															+        return channel_103_list
														
 
															+
														
 
															+    reg_standard = "(?P<value>" + channel_103_1 + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    for m in match:
														
 
															+        m_dict = m.groupdict()
														
 
															+        m_span = m.span()
														
 
															+        keyword_index = [m_span[0], m_span[1]]
														
 
															+        keyword = m_dict.get('value')
														
 
															+        channel_103_list.append([keyword, keyword_index])
														
 
															+    if channel_103_list:
														
 
															+        print("1", channel_103_list)
														
 
															+        return channel_103_list
														
 
															+
														
 
															+    reg_standard = "(?P<value>" + channel_103_2 + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    for m in match:
														
 
															+        m_dict = m.groupdict()
														
 
															+        m_span = m.span()
														
 
															+        keyword_index = [m_span[0], m_span[1]]
														
 
															+        keyword = m_dict.get('value')
														
 
															+        if re.search(channel_103_after, _str[keyword_index[1]:keyword_index[1]+50]):
														
 
															+            channel_103_list.append([keyword, keyword_index])
														
 
															+    if channel_103_list:
														
 
															+        print("2", channel_103_list)
														
 
															+        return channel_103_list
														
 
															+
														
 
															+    reg_standard = "(?P<value>" + channel_103_3 + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    for m in match:
														
 
															+        m_dict = m.groupdict()
														
 
															+        m_span = m.span()
														
 
															+        keyword_index = [m_span[0], m_span[1]]
														
 
															+        keyword = m_dict.get('value')
														
 
															+        if re.search(channel_103_after, _str[keyword_index[1]:keyword_index[1]+50]):
														
 
															+            channel_103_list.append([keyword, keyword_index])
														
 
															+    if channel_103_list:
														
 
															+        print("3", channel_103_list)
														
 
															+        return channel_103_list
														
 
															+
														
 
															+    reg_standard = "(?P<value>" + channel_103_4 + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    for m in match:
														
 
															+        m_dict = m.groupdict()
														
 
															+        m_span = m.span()
														
 
															+        keyword_index = [m_span[0], m_span[1]]
														
 
															+        keyword = m_dict.get('value')
														
 
															+        channel_103_list.append([keyword, keyword_index])
														
 
															+    if channel_103_list:
														
 
															+        print("4", channel_103_list)
														
 
															+        return channel_103_list
														
 
															+
														
 
															+    reg_standard = "(?P<value>" + channel_103_5 + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    for m in match:
														
 
															+        m_dict = m.groupdict()
														
 
															+        m_span = m.span()
														
 
															+        keyword_index = [m_span[0], m_span[1]]
														
 
															+        keyword = m_dict.get('value')
														
 
															+        channel_103_list.append([keyword, keyword_index])
														
 
															+    if channel_103_list:
														
 
															+        print("5", channel_103_list)
														
 
															+        return channel_103_list
														
 
															+
														
 
															+    return channel_103_list
														
 
															+
														
 
															+
														
 
															+def re_not_channel_103(_str):
														
 
															+    match = re.findall(not_channel_103, _str)
														
 
															+    if match:
														
 
															+        for word in match:
														
 
															+            instead = "#" * len(word)
														
 
															+            _str = re.sub(word, instead, _str)
														
 
															+    return _str
														
 
															+
														
 
															+
														
 
															+def re_channel_103(text):
														
 
															+    # 替换易混淆词
														
 
															+    clean_text = re_not_channel_103(text)
														
 
															+
														
 
															+    # 查找符合标准形式的
														
 
															+    channel_103_list = re_standard_channel_103(clean_text)
														
 
															+    return channel_103_list
														
 
															+
														
 
															+
														
 
															+def extract_channel_103(text):
														
 
															+    result_list = []
														
 
															+    channel_103_list = re_channel_103(text)
														
 
															+    if channel_103_list:
														
 
															+        for word, text_index in channel_103_list:
														
 
															+            if word is not None:
														
 
															+                if text_index[1]-text_index[0] != len(word) \
														
 
															+                        or text_index[1]-text_index[0] >= 20:
														
 
															+                    return []
														
 
															+                d = {"body": word, "begin_index": text_index[0], "end_index": text_index[1]}
														
 
															+                result_list.append(d)
														
 
															+    return result_list
														
 
															+
														
 
															+
														
 
															+def test_csv(_path):
														
 
															+    df = pd.read_csv(_path)
														
 
															+
														
 
															+    predict_list = []
														
 
															+    for index, row in df.iterrows():
														
 
															+        word_list = re_channel_103(row["doctextcon"], "")
														
 
															+        if word_list:
														
 
															+            predict = word_list
														
 
															+        else:
														
 
															+            predict = []
														
 
															+        print("predict", predict)
														
 
															+        predict_list.append(str(predict))
														
 
															+
														
 
															+    predict_df = pd.DataFrame(predict_list)
														
 
															+    df = pd.concat([df, predict_df], axis=1)
														
 
															+
														
 
															+    df.to_csv(_path)
														
 
															+    print("finish write!")
														
 
															+
														
 
															+
														
 
															+def test_str():
														
 
															+    s = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
														
 
															+    s = '''
														
 
															+    (第1次澄清） 发布时间：2020-11-25 致各招标文件持有者： 招标人──舟山市
														
 
															+    '''
														
 
															+    print(extract_channel_103(s))
														
 
															+
														
 
															+
														
 
															+def test_html():
														
 
															+    html_path = "C:/Users/Administrator/Desktop/3.html"
														
 
															+
														
 
															+    with open(html_path, "r") as f:
														
 
															+        s = f.read()
														
 
															+
														
 
															+    print(extract_channel_103(s, title=""))
														
 
															+
														
 
															+
														
 
if __name__ == "__main__":
    # Local CSV path; only referenced by the disabled calls below.
    path = "D:\\BIDI_DOC\\比地_文档\\澄清答疑_result.csv"
    # test_csv(path)
    test_str()
    # test_html(path)
    pass
														
 
															+
														
--- a/BiddingKG/dl/if_joint_bidding/re_if_joint_bidding.py
+++ b/BiddingKG/dl/if_joint_bidding/re_if_joint_bidding.py
@@ -0,0 +1,231 @@
 
															+import pandas as pd
														
 
															+import re
														
 
															+
														
 
															+# 申请人可以组成联合体报名，联合体的家数最多不能超过两家
														
 
															+# 本项目不接受供应商以联合体方式进行投标。
														
 
															+
														
 
# Bidding-action word (optionally preceded by "参与" and followed by "登记");
# appended after "联合体" in the body patterns of forms 1 and 2.
bidway = '(参与|)(报价|投标|招标|竞价|报名|参加|资格预审|应答|谈判|磋商|竞标)(登记|)'
														
 
															+
														
 
															+# 是否接收联合体投标： 不接受
														
 
															+# 联合体投标: 不允许
														
 
															+# 是否允许联合体投标登记：是
														
 
															+# （是/否）接受联合体投标：否
														
 
															+# 是否接受联合体投标 不接受
														
 
															+# 是否接受联合体投标:不接受
														
 
															+# 本项目（是/否）接受联合体投标：否
														
 
															+# joint_bidding_prefix_1 = '(不[ ]?|[(（]{0,1}[ ]?[是否不][ ]?[）)]{0,1}|)'
														
 
# Form 1: optional "是否", optional verb, "联合体", a mandatory bidding action,
# then a mandatory 1-2 character separator and a mandatory answer word.
joint_bidding_prefix_1 = "(是否|)"
bidway_1 = bidway
joint_bidding_body_1 = '(允许|接受|接收|)(联合体|独立体或联合体)' + bidway_1
joint_bidding_suffix_1 = '([ :：。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'
														
 
															+
														
 
															+# 不接受(接受或不接受)联合体投标
														
 
															+# （否）接受联合体。
														
 
															+# （不）接受联合体投标
														
 
															+# （ 不 ）接受联合体。
														
 
															+# 本项目 不 允许联合体投标。
														
 
															+# （否）接受联合体投标
														
 
															+# 本项目不接受联合体参与投标。
														
 
															+# 本合同包接受联合体投标
														
 
															+# 本项目不接受联合体应答，
														
 
															+# 不接受联合体投标
														
 
															+# 否 接受联合体
														
 
															+# 接受 联合体资格预审
														
 
															+# 接受独立体或联合体报名，联合体的家数最多不能超过两家
														
 
# Form 2: optional negation or (bracketed) yes/no marker, a mandatory verb,
# "联合体", then an optional bidding action, separator and answer word.
joint_bidding_prefix_2 = '(不[ ]?|[(（]{0,1}[ ]?[是否不][ ]?[）)]{0,1}|)'
bidway_2 = "(" + bidway + "|)"
joint_bidding_body_2 = '(允许|接受|接收).?(联合体|独立体或联合体)' + bidway_2
joint_bidding_suffix_2 = '([ :：。]{0,2})(不接受|不接收|不允许|允许|接受|接收|是|否|)'
														
 
															+# joint_bidding_suffix_2 = ""
														
 
															+
														
 
															+# 是否允许联合体 不允许
														
 
# Form 3: mandatory "是否", a verb and "联合体" with no bidding action,
# followed by a mandatory separator and answer word.
joint_bidding_prefix_3 = '(是否)'
joint_bidding_body_3 = '(允许|接受|接收).?(联合体|独立体或联合体)'
joint_bidding_suffix_3 = '([ :：。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'
														
 
															+
														
 
															+
														
 
															+# 是否接受联合体投标：（ ）是（√ ）否。
														
 
															+
														
 
															+
														
 
															+# 投标人须知前附表规定接受联合体投标的
														
 
															+# 联合体投标的，
														
 
															+# 允许联合体投标的
														
 
															+# 如项目接受联合体投标
														
 
															+# （是/否）接受联合体投标: 是 否
														
 
															+# 招标□接受 ?不接受联合体投标
														
 
															+# 联合体投标：接受；不接受
														
 
															+# （是/否）
														
 
															+# 是 否
														
 
															+# 接受；不接受
														
 
															+# 接受 ?不接受
														
 
															+# (接受或不接受)
														
 
															+# 是否允许联合体： 1 是 0 否
														
 
															+# 允许联合体报名 □是 ■ 否
														
 
# First pattern deleted by re_not_joint_bidding: conditional clauses about
# joint bidding, consortium role/agreement terms, and two literal option
# templates.
not_joint_bidding_1 = '(' \
                      '联合体投标的|如项目接受联合体投标' \
                      '|是否允许联合体： 1 是 0 否' \
                      '|联合体参加的|联合体牵头人|联合体牵头方|联合体成员|联合体（牵头人）' \
                      '|联合体各方|联合体协议' \
                      '|允许联合体报名 □是 ■ 否' \
                      ')'
# Second pattern deleted by re_not_joint_bidding: option lists that show both
# "是" and "否", or both "接受" and "不接受".
not_joint_bidding_2 = '(' \
                      '[(（]{0,1}.?是.{1,2}否[)）]{0,1}' \
                      '|[(（]{0,1}.?接受.{0,2}不接受[)）]{0,1}' \
                      '|1 是 0 否' \
                      '|.{1}接受.{1,2}不接受' \
                      ')'
														
 
															+
														
 
															+
														
 
															+def re_not_joint_bidding(_str):
														
 
															+    _str = re.sub(not_joint_bidding_1, "", _str)
														
 
															+    _str = re.sub(not_joint_bidding_2, "", _str)
														
 
															+    return _str
														
 
															+
														
 
															+
														
 
															+def re_standard_joint_bidding(_str):
														
 
															+    # 第一种形式
														
 
															+    reg_standard = "(?P<prefix>" + joint_bidding_prefix_1 + ")" \
														
 
															+                   + "(?P<body>" + joint_bidding_body_1 + ")" \
														
 
															+                   + "(?P<suffix>" + joint_bidding_suffix_1 + ")"
														
 
															+    # print("prefix", re.findall(joint_bidding_prefix_1, _str))
														
 
															+    # print("body", re.search(joint_bidding_body_1, _str))
														
 
															+    # print("suffix", re.search(joint_bidding_suffix_1, _str))
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    joint_bidding_list = []
														
 
															+    for m in match:
														
 
															+        m_dict = m.groupdict()
														
 
															+        m_span = m.span()
														
 
															+        keyword = ""
														
 
															+        keyword += m_dict.get("prefix") + m_dict.get("body") + m_dict.get("suffix")
														
 
															+        joint_bidding_list.append([keyword, m_span[0], m_span[1]])
														
 
															+    if joint_bidding_list:
														
 
															+        return joint_bidding_list
														
 
															+
														
 
															+    # 第二种形式
														
 
															+    reg_standard = "(?P<prefix>" + joint_bidding_prefix_2 + ")" \
														
 
															+                   + "(?P<body>" + joint_bidding_body_2 + ")" \
														
 
															+                   + "(?P<suffix>" + joint_bidding_suffix_2 + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    # print("prefix", re.findall(joint_bidding_prefix_2, _str))
														
 
															+    # print("body", re.search(joint_bidding_body_2, "接受 联合体资格预审"))
														
 
															+    # print("suffix", re.search(joint_bidding_suffix_2, _str))
														
 
															+    joint_bidding_list = []
														
 
															+    for m in match:
														
 
															+        m_dict = m.groupdict()
														
 
															+        m_span = m.span()
														
 
															+        keyword = ""
														
 
															+        keyword += m_dict.get("prefix") + m_dict.get("body") + m_dict.get("suffix")
														
 
															+        # 排除 '是否' 的情况
														
 
															+        if _str[m_span[0]-1:m_span[0]] != "是":
														
 
															+            joint_bidding_list.append([keyword, [m_span[0], m_span[1]]])
														
 
															+    if joint_bidding_list:
														
 
															+        return joint_bidding_list
														
 
															+
														
 
															+    # 第三种形式
														
 
															+    reg_standard = "(?P<prefix>" + joint_bidding_prefix_3 + ")" \
														
 
															+                   + "(?P<body>" + joint_bidding_body_3 + ")" \
														
 
															+                   + "(?P<suffix>" + joint_bidding_suffix_3 + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    # print("prefix", re.findall(joint_bidding_prefix_2, _str))
														
 
															+    # print("body", re.search(joint_bidding_body_2, "接受 联合体资格预审"))
														
 
															+    # print("suffix", re.search(joint_bidding_suffix_2, _str))
														
 
															+    joint_bidding_list = []
														
 
															+    for m in match:
														
 
															+        m_dict = m.groupdict()
														
 
															+        m_span = m.span()
														
 
															+        keyword = ""
														
 
															+        keyword += m_dict.get("prefix") + m_dict.get("body") + m_dict.get("suffix")
														
 
															+        joint_bidding_list.append([keyword, [m_span[0], m_span[1]]])
														
 
															+    if joint_bidding_list:
														
 
															+        return joint_bidding_list
														
 
															+
														
 
															+    return joint_bidding_list
														
 
															+
														
 
															+
														
 
															+def re_joint_bidding(text):
														
 
															+    # 替换小括号
														
 
															+    text_clean = re.sub("\\(", "（", text)
														
 
															+    text_clean = re.sub("\\)", "）", text_clean)
														
 
															+
														
 
															+    # 替换易混淆词
														
 
															+    text_clean = re_not_joint_bidding(text_clean)
														
 
															+    # print("clean", text_clean)
														
 
															+
														
 
															+    # 查找符合标准形式的
														
 
															+    joint_bidding_list = re_standard_joint_bidding(text_clean)
														
 
															+    return joint_bidding_list
														
 
															+
														
 
															+
														
 
															+def judge_joint_bidding(_list):
														
 
															+    new_list = []
														
 
															+    for l in _list:
														
 
															+        if "否" in l[0] or "不" in l[0]:
														
 
															+            new_list.append(["0" + " " + l[0], l[1]])
														
 
															+        else:
														
 
															+            new_list.append(["1" + " " + l[0], l[1]])
														
 
															+
														
 
															+    return new_list
														
 
															+
														
 
															+
														
 
															+def extract_joint_bidding(text):
														
 
															+    result_list = []
														
 
															+    joint_bidding_list = re_joint_bidding(text)
														
 
															+    joint_bidding_list = judge_joint_bidding(joint_bidding_list)
														
 
															+    if joint_bidding_list:
														
 
															+        for word, text_index in joint_bidding_list:
														
 
															+            if word is not None:
														
 
															+                d = {"body": word, "begin_index": text_index[0], "end_index": text_index[1]}
														
 
															+                result_list.append(d)
														
 
															+    return result_list
														
 
															+
														
 
															+
														
 
															+def test_csv(_path):
														
 
															+    df = pd.read_csv(_path)
														
 
															+
														
 
															+    predict_list = []
														
 
															+    for index, row in df.iterrows():
														
 
															+        word_list = re_joint_bidding(row["doctextcon"])
														
 
															+        if word_list:
														
 
															+            predict = word_list
														
 
															+        else:
														
 
															+            predict = []
														
 
															+        print("predict", predict)
														
 
															+        predict_list.append(str(predict))
														
 
															+
														
 
															+    predict_df = pd.DataFrame(predict_list)
														
 
															+    df = pd.concat([df, predict_df], axis=1)
														
 
															+
														
 
															+    df.to_csv(_path)
														
 
															+    print("finish write!")
														
 
															+
														
 
															+
														
 
															+def test_str():
														
 
															+    # （不）接受联合体投标
														
 
															+    # 本项目不接受供应商以联合体方式进行投标。
														
 
															+    # （否）接受联合体。
														
 
															+    # 是否接收联合体投标： 不接受
														
 
															+    # 联合体投标: 不允许
														
 
															+    # 是否允许联合体投标登记：是
														
 
															+    s = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
														
 
															+    s = '''
														
 
															+测绘服务 是否允许联合体 不允许 行业
														
 
															+    '''
														
 
															+    print(extract_joint_bidding(s))
														
 
															+
														
 
															+
														
 
															+def test_html(_path):
														
 
															+    html_path = _path
														
 
															+
														
 
															+    with open(html_path, "r") as f:
														
 
															+        s = f.read()
														
 
															+
														
 
															+    print(extract_joint_bidding(s, title=""))
														
 
															+
														
 
															+
														
 
if __name__ == "__main__":
    # Local CSV that test_csv reads and then overwrites with an added
    # prediction column.
    path = "D:\\BIDI_DOC\\比地_文档\\投标工期_result.csv"
    test_csv(path)
    # test_str()
    # test_html(path)
    pass
														
 
															+
														
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -4,7 +4,6 @@ Created on 2019年1月4日
 
															 @author: User
														
 
															 '''
														
 
															 import os
														
 
															-
														
 
															 from bs4 import BeautifulSoup, Comment
														
 
															 import copy
														
 
															 import re
														
@@ -24,10 +23,11 @@ import BiddingKG.dl.interface.Preprocessing as Preprocessing
 
															 import BiddingKG.dl.interface.getAttributes as getAttributes
														
 
															 import BiddingKG.dl.complaint.punish_predictor as punish_rule
														
 
															 import json
														
 
															+from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
														
 
															+from BiddingKG.dl.ratio.re_ratio import extract_ratio
														
 
															-
														
 
															-#自定义jsonEncoder
														
 
															+# 自定义jsonEncoder
														
 
															 class MyEncoder(json.JSONEncoder):
														
 
															     def default(self, obj):
														
 
															         if isinstance(obj, np.ndarray):
														
@@ -41,39 +41,40 @@ class MyEncoder(json.JSONEncoder):
 
															             return obj
														
 
															         return json.JSONEncoder.default(self, obj)
														
 
															+
														
 
															 def predict(doc_id,text,title="",page_time="",**kwargs):
														
 
															     cost_time = dict()
														
 
															     start_time = time.time()
														
 
															-    log("start process doc %s"%(str(doc_id)))
														
 
															+    # log("start process doc %s"%(str(doc_id)))
														
 
															     list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
														
 
															-    log("get preprocessed done of doc_id%s"%(doc_id))
														
 
															+    # log("get preprocessed done of doc_id%s"%(doc_id))
														
 
															     cost_time["preprocess"] = round(time.time()-start_time,2)
														
 
															     cost_time.update(_cost_time)
														
 
															-    #依赖句子顺序
														
 
															+    # 依赖句子顺序
														
 
															     start_time = time.time()
														
 
															     list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
														
 
															     cost_time["channel"] = round(time.time()-start_time,2)
														
 
															     start_time = time.time()
														
 
															     codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
														
 
															-    log("get codename done of doc_id%s"%(doc_id))
														
 
															+    # log("get codename done of doc_id%s"%(doc_id))
														
 
															     cost_time["codename"] = round(time.time()-start_time,2)
														
 
															     start_time = time.time()
														
 
															     predictor.getPredictor("prem").predict(list_sentences,list_entitys)
														
 
															-    log("get prem done of doc_id%s"%(doc_id))
														
 
															+    # log("get prem done of doc_id%s"%(doc_id))
														
 
															     cost_time["prem"] = round(time.time()-start_time,2)
														
 
															     start_time = time.time()
														
 
															     predictor.getPredictor("product").predict(list_sentences,list_entitys)
														
 
															-    log("get product done of doc_id%s"%(doc_id))
														
 
															+    # log("get product done of doc_id%s"%(doc_id))
														
 
															     cost_time["product"] = round(time.time()-start_time,2)
														
 
															     start_time = time.time()
														
 
															     product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
														
 
															-    log("get product attributes done of doc_id%s"%(doc_id))
														
 
															+    # log("get product attributes done of doc_id%s"%(doc_id))
														
 
															     cost_time["product_attrs"] = round(time.time()-start_time,2)
														
 
															     start_time = time.time()
														
@@ -82,12 +83,12 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
															     start_time = time.time()
														
 
															     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
														
 
															-    log("get epc done of doc_id%s"%(doc_id))
														
 
															+    # log("get epc done of doc_id%s"%(doc_id))
														
 
															     cost_time["person"] = round(time.time()-start_time,2)
														
 
															     start_time = time.time()
														
 
															     predictor.getPredictor("time").predict(list_sentences, list_entitys)
														
 
															-    log("get time done of doc_id%s"%(doc_id))
														
 
															+    # log("get time done of doc_id%s"%(doc_id))
														
 
															     cost_time["time"] = round(time.time()-start_time,2)
														
 
															     # 需在getPredictor("prem")后  getAttributes.getPREMs 前
														
@@ -104,11 +105,46 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
															                         _entity.values[1] = 0.51
														
 
															                         _entity.set_Money(1, _entity.values)
														
 
															-    #依赖句子顺序
														
 
															+    # 2021-12-08新增：提取：总价,单价,比率
														
 
															+    total_money_list = []
														
 
															+    unit_money_list = []
														
 
															+    ratio_list = []
														
 
															+    for i in range(len(list_entitys)):
														
 
															+        list_entity = list_entitys[i]
														
 
															+
														
 
															+        # 总价单价
														
 
															+        for _entity in list_entity:
														
 
															+            if _entity.entity_type == 'money':
														
 
															+                word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text
														
 
															+                # 总价在中投标金额中
														
 
															+                if _entity.label == 1:
														
 
															+                    result = extract_total_money(word_of_sentence,
														
 
															+                                                 _entity.entity_text,
														
 
															+                                                 [_entity.wordOffset_begin, _entity.wordOffset_end])
														
 
															+                    if result:
														
 
															+                        total_money_list.append(result)
														
 
															+
														
 
															+                # 单价在普通金额中
														
 
															+                else:
														
 
															+                    result = extract_unit_money(word_of_sentence,
														
 
															+                                                _entity.entity_text,
														
 
															+                                                [_entity.wordOffset_begin, _entity.wordOffset_end])
														
 
															+                    if result:
														
 
															+                        unit_money_list.append(result)
														
 
															+
														
 
															+        # 比率
														
 
															+        all_sentence = ""
														
 
															+        for sentence in list_sentences[i]:
														
 
															+            all_sentence += sentence.sentence_text + "，"
														
 
															+        result = extract_ratio(all_sentence)
														
 
															+        if result:
														
 
															+            ratio_list.append(result)
														
 
															+
														
 
															+    # 依赖句子顺序
														
 
															     start_time = time.time()
														
 
															     entityLink.link_entitys(list_entitys)
														
 
															     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
														
 
															-    log("get attributes done of doc_id%s"%(doc_id))
														
 
															+    # log("get attributes done of doc_id%s"%(doc_id))
														
 
															     cost_time["attrs"] = round(time.time()-start_time,2)
														
 
															     start_time = time.time()
														
@@ -121,13 +157,17 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
															                 if product in d['project_name']:
														
 
															                     d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
														
 
															-    #print(prem)
														
 
															+    # print(prem)
														
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
														
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
														
 
															     data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1])
														
 
															     data_res["cost_time"] = cost_time
														
 
															     data_res["success"] = True
														
 
															+    data_res["total_money"] = total_money_list
														
 
															+    data_res["unit_money"] = unit_money_list
														
 
															+    data_res["ratio"] = ratio_list
														
 
															+
														
 
															     # for _article in list_articles:
														
 
															     #     log(_article.content)
														
 
															     #
														
--- a/BiddingKG/dl/money/re_money_total_unit.py
+++ b/BiddingKG/dl/money/re_money_total_unit.py
@@ -0,0 +1,130 @@
 
															+import json
														
 
															+import pandas as pd
														
 
															+import re
														
 
															+from bs4 import BeautifulSoup
														
 
															+
														
 
															+# 总价
														
 
															+total_money = '(合计.?金额|合.?计|总.?价)'
														
 
															+# 单价
														
 
															+unit_money = '(单价|([0-9.，,]+([（(]?元[)）]?)?/))'
														
 
															+
														
 
															+
														
 
															+def re_standard_total(_str):
														
 
															+    reg_standard = "(?P<value>" + total_money + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    total_money_list = []
														
 
															+    if match:
														
 
															+        for m in match:
														
 
															+            m_dict = m.groupdict()
														
 
															+            m_span = m.span()
														
 
															+            keyword_index = [m_span[0], m_span[1]]
														
 
															+            keyword = m_dict.get("value")
														
 
															+            # total_money_list.append([keyword, keyword_index])
														
 
															+            total_money_list.append([keyword, keyword_index, _str])
														
 
															+
														
 
															+    return total_money_list
														
 
															+
														
 
															+
														
 
															+def re_standard_unit(_str):
														
 
															+    reg_standard = "(?P<value>" + unit_money + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    unit_money_list = []
														
 
															+    if match:
														
 
															+        for m in match:
														
 
															+            m_dict = m.groupdict()
														
 
															+            m_span = m.span()
														
 
															+            keyword_index = [m_span[0], m_span[1]]
														
 
															+            keyword = m_dict.get("value")
														
 
															+            # unit_money_list.append([keyword, keyword_index])
														
 
															+            unit_money_list.append([keyword, keyword_index, _str])
														
 
															+
														
 
															+    return unit_money_list
														
 
															+
														
 
															+
														
 
															+def re_total(text, money, index):
														
 
															+    # 对已提取的中投标金额的前面文字进行正则
														
 
															+    prefix_threshold = 10
														
 
															+    suffix_threshold = 10
														
 
															+    # if index_threshold < index[0]:
														
 
															+    #     money_text = text[index[0]-index_threshold:index[0]]
														
 
															+    #     print("total", money, text[index[0]-index_threshold:index[1]], money_text)
														
 
															+    # else:
														
 
															+    #     money_text = text[:index[0]]
														
 
															+    #     print("total", money, text[:index[1]], money_text)
														
 
															+
														
 
															+    prefix_index = index[0] - prefix_threshold
														
 
															+    suffix_index = index[1] + suffix_threshold
														
 
															+    money_text = text[prefix_index if prefix_index > 0 else 0:
														
 
															+                      suffix_index if suffix_index < len(text) else len(text)]
														
 
															+
														
 
															+    # 查找符合标准形式的 总价
														
 
															+    total_money_list = re_standard_total(money_text)
														
 
															+    return total_money_list
														
 
															+
														
 
															+
														
 
															+def re_unit(text, money, index):
														
 
															+    # 对已提取的中投标金额的前面文字进行正则
														
 
															+    prefix_threshold = 10
														
 
															+    suffix_threshold = 10
														
 
															+    # if prefix_threshold < index[0]:
														
 
															+    #     money_text = text[index[0]-prefix_threshold:index[0]]
														
 
															+    #     print("unit", money, text[index[0]-prefix_threshold:index[1]], money_text)
														
 
															+    # else:
														
 
															+    #     money_text = text[:index[0]]
														
 
															+    #     print("unit", money, text[:index[1]], money_text)
														
 
															+
														
 
															+    prefix_index = index[0] - prefix_threshold
														
 
															+    suffix_index = index[1] + suffix_threshold
														
 
															+    money_text = text[prefix_index if prefix_index > 0 else 0:
														
 
															+                      suffix_index if suffix_index < len(text) else len(text)]
														
 
															+
														
 
															+    # 查找符合标准形式的 单价
														
 
															+    unit_money_list = re_standard_unit(money_text)
														
 
															+    return unit_money_list
														
 
															+
														
 
															+
														
 
															+def extract_total_money(text, money, index):
														
 
															+    result_list = []
														
 
															+    total_money_list = re_total(text, money, index)
														
 
															+    if total_money_list:
														
 
															+        for word, text_index, context in total_money_list:
														
 
															+            d = {"body": word, "begin_index": text_index[0],
														
 
															+                 "end_index": text_index[1], "context": context}
														
 
															+            result_list.append(d)
														
 
															+    return result_list
														
 
															+
														
 
															+
														
 
															+def extract_unit_money(text, money, index):
														
 
															+    result_list = []
														
 
															+    unit_money_list = re_unit(text, money, index)
														
 
															+    if unit_money_list:
														
 
															+        for word, text_index, context in unit_money_list:
														
 
															+            d = {"body": word, "begin_index": text_index[0],
														
 
															+                 "end_index": text_index[1], "context": context}
														
 
															+            result_list.append(d)
														
 
															+    return result_list
														
 
															+
														
 
															+
														
 
															+def test_str():
														
 
															+    s = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
														
 
															+    s = '往往，20(元)/平方'
														
 
															+    print(extract_unit_money(s, "785.0", [6, 11]))
														
 
															+
														
 
															+
														
 
															+def test_html():
														
 
															+    html_path = "C:/Users/Administrator/Desktop/3.html"
														
 
															+
														
 
															+    with open(html_path, "r") as f:
														
 
															+        s = f.read()
														
 
															+
														
 
															+    print(extract_total_money(s))
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    # extract_bidway(s)
														
 
															+
														
 
															+    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
														
 
															+    test_str()
														
 
															+    # test_html(path)
														
 
															+    pass
														
 
															+
														
--- a/BiddingKG/dl/money/test_re_money_total_unit.py
+++ b/BiddingKG/dl/money/test_re_money_total_unit.py
@@ -0,0 +1,75 @@
 
															+import json
														
 
															+import re
														
 
															+import sys, os
														
 
															+import time
														
 
															+
														
 
															+import pandas as pd
														
 
															+from bs4 import BeautifulSoup
														
 
															+sys.path.append(os.path.abspath("../.."))
														
 
															+from BiddingKG.dl.interface.extract import predict
														
 
															+
														
 
															+
														
 
															+def bidi_predict(html_str):
														
 
															+    content = html_str
														
 
															+    # content = "<div>总价：1110</div>"
														
 
															+    result_dict = json.loads(predict("1", content))
														
 
															+    return result_dict
														
 
															+
														
 
															+
														
 
															+def test_csv(_path):
														
 
															+    start_time = time.time()
														
 
															+    df = pd.read_csv(_path)
														
 
															+
														
 
															+    # total money
														
 
															+    predict_list_1 = []
														
 
															+    predict_list_2 = []
														
 
															+    for index, row in df.iterrows():
														
 
															+        # if index >= 1000:
														
 
															+        #     break
														
 
															+
														
 
															+        if index % 50 == 0:
														
 
															+            print("="*30, "Loop", index, "="*30)
														
 
															+
														
 
															+        html_str = row["dochtmlcon"]
														
 
															+        # html_str = df.loc[75, "dochtmlcon"]
														
 
															+        # print(html_str)
														
 
															+
														
 
															+        # 先筛选
														
 
															+        # possible = '((合计.?金额|合.?计|总.?价|单.?价)(（元）)?([:： ]))' \
														
 
															+        #            '|([0-9.，,]+([（(]?元[)）]?)?/)'
														
 
															+        # if not re.search(possible, html_str):
														
 
															+        #     predict_list_1.append(str([]))
														
 
															+        #     predict_list_2.append(str([]))
														
 
															+        #     continue
														
 
															+
														
 
															+        # 先经过模型处理
														
 
															+        result_dict = bidi_predict(html_str)
														
 
															+
														
 
															+        # 获取总价单价
														
 
															+        word_list_1 = result_dict.get("total_money")
														
 
															+        word_list_2 = result_dict.get("unit_money")
														
 
															+
														
 
															+        if word_list_1:
														
 
															+            predict = word_list_1
														
 
															+        else:
														
 
															+            predict = []
														
 
															+        print("predict total money", predict)
														
 
															+        predict_list_1.append(str(predict))
														
 
															+
														
 
															+        if word_list_2:
														
 
															+            predict = word_list_2
														
 
															+        else:
														
 
															+            predict = []
														
 
															+        print("predict unit money", predict)
														
 
															+        predict_list_2.append(str(predict))
														
 
															+
														
 
															+    predict_df_1 = pd.DataFrame(predict_list_1)
														
 
															+    predict_df_2 = pd.DataFrame(predict_list_2)
														
 
															+    df = pd.concat([df, predict_df_1, predict_df_2], axis=1)
														
 
															+    df.to_csv(_path)
														
 
															+    print("finish write!", time.time()-start_time)
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
														
 
															+    test_csv(path)
														
--- a/BiddingKG/dl/offer_type/re_offer_type.py
+++ b/BiddingKG/dl/offer_type/re_offer_type.py
@@ -0,0 +1,28 @@
 
															+import pandas as pd
														
 
															+import re
														
 
															+
														
 
															+# 报价类型为总价报价
														
 
															+# 报价类型： 闭口价
														
 
															+# 报价类型：国内含税价/人民币
														
 
															+# 报价类型：国内含税价;人民币
														
 
															+# 报价类型： 浮动价
														
 
															+# 报价类型 含税含运费
														
 
															+# 报价类型 单个商品报价
														
 
															+# 报价类型：单个标的报单价
														
 
															+# 报价类型：多个标的报总价，
														
 
															+# 报价类型：不含税（到厂）
														
 
															+# 报价类型： 金额
														
 
															+# 报价类型 含税含运费
														
 
															+# 报价类型：单个标的报单价
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+# 报价类型：
														
 
															+
														
 
															+
														
--- a/BiddingKG/dl/ratio/re_ratio.py
+++ b/BiddingKG/dl/ratio/re_ratio.py
@@ -0,0 +1,60 @@
 
															+import re
														
 
															+
														
 
															+ratio = '((上浮|下浮)(率|).{0,2}[0-9.]+%)'
														
 
															+
														
 
															+
														
 
															+def re_standard_ratio(_str):
														
 
															+    reg_standard = "(?P<value>" + ratio + ")"
														
 
															+    match = re.finditer(reg_standard, _str)
														
 
															+    ratio_list = []
														
 
															+    if match:
														
 
															+        for m in match:
														
 
															+            m_dict = m.groupdict()
														
 
															+            m_span = m.span()
														
 
															+            keyword_index = [m_span[0], m_span[1]]
														
 
															+            keyword = m_dict.get("value")
														
 
															+            ratio_list.append([keyword, keyword_index])
														
 
															+
														
 
															+    return ratio_list
														
 
															+
														
 
															+
														
 
															+def re_ratio(text):
														
 
															+    # 查找符合标准形式的 总价
														
 
															+    ratio_list = re_standard_ratio(text)
														
 
															+    return ratio_list
														
 
															+
														
 
															+
														
 
															+def extract_ratio(text):
														
 
															+    result_list = []
														
 
															+    total_money_list = re_ratio(text)
														
 
															+    if total_money_list:
														
 
															+        for word, text_index in total_money_list:
														
 
															+            d = {"body": word, "begin_index": text_index[0],
														
 
															+                 "end_index": text_index[1]}
														
 
															+            result_list.append(d)
														
 
															+    return result_list
														
 
															+
														
 
															+
														
 
															+def test_str():
														
 
															+    s = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
														
 
															+    s = '年利率较基准利率的上浮率：30% 活期存款年利率：0.455% 协定存'
														
 
															+    print(extract_ratio(s))
														
 
															+
														
 
															+
														
 
															+def test_html():
														
 
															+    html_path = "C:/Users/Administrator/Desktop/3.html"
														
 
															+
														
 
															+    with open(html_path, "r") as f:
														
 
															+        s = f.read()
														
 
															+
														
 
															+    print(extract_ratio(s))
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    # extract_bidway(s)
														
 
															+
														
 
															+    # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
														
 
															+    test_str()
														
 
															+    # test_html(path)
														
 
															+    pass
														
 
															+
														
--- a/BiddingKG/dl/ratio/test_re_ratio.py
+++ b/BiddingKG/dl/ratio/test_re_ratio.py
@@ -0,0 +1,61 @@
 
															+import json
														
 
															+import sys, os
														
 
															+import time
														
 
															+import pandas as pd
														
 
															+sys.path.append(os.path.abspath("../../.."))
														
 
															+print("sys.path[-1]", sys.path[-1])
														
 
															+from BiddingKG.dl.interface.extract import predict
														
 
															+
														
 
															+
														
 
															+def bidi_predict(html_str):
														
 
															+    content = html_str
														
 
															+    result_dict = json.loads(predict("1", content))
														
 
															+    return result_dict
														
 
															+
														
 
															+
														
 
															+def test_csv(_path):
														
 
															+    start_time = time.time()
														
 
															+    df = pd.read_csv(_path)
														
 
															+
														
 
															+    # ratio, total_money, unit_money
														
 
															+    predict_list_1 = []
														
 
															+    predict_list_2 = []
														
 
															+    predict_list_3 = []
														
 
															+    for index, row in df.iterrows():
														
 
															+        if index >= 1000:
														
 
															+            break
														
 
															+
														
 
															+        if index % 50 == 0:
														
 
															+            print("="*30, "Loop", index, time.time()-start_time, "="*30)
														
 
															+
														
 
															+        html_str = row["dochtmlcon"]
														
 
															+
														
 
															+        # 先经过模型处理
														
 
															+        result_dict = bidi_predict(html_str)
														
 
															+
														
 
															+        # 获取比率总价单价
														
 
															+        word_list_1 = result_dict.get("total_money")
														
 
															+        word_list_2 = result_dict.get("unit_money")
														
 
															+        word_list_3 = result_dict.get("ratio")
														
 
															+
														
 
															+        # print("predict ratio", word_list_3)
														
 
															+        predict_list_3.append(str(word_list_3))
														
 
															+
														
 
															+        # print("predict total money", word_list_1)
														
 
															+        predict_list_1.append(str(word_list_1))
														
 
															+
														
 
															+        # print("predict unit money", word_list_2)
														
 
															+        predict_list_2.append(str(word_list_2))
														
 
															+
														
 
															+    predict_df_1 = pd.DataFrame(predict_list_1)
														
 
															+    predict_df_2 = pd.DataFrame(predict_list_2)
														
 
															+    predict_df_3 = pd.DataFrame(predict_list_3)
														
 
															+    df = pd.concat([df, predict_df_3, predict_df_1, predict_df_2], axis=1)
														
 
															+    df.to_csv(_path)
														
 
															+    print("finish write!", time.time()-start_time)
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
														
 
															+    path = '比率_result.csv'
														
 
															+    test_csv(path)
														
--- a/BiddingKG/dl/test/test4.py
+++ b/BiddingKG/dl/test/test4.py
@@ -39,7 +39,7 @@ if __name__=="__main__":
 
															     # filename = "比地_52_79929693.html"
														
 
															     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
														
 
															     # text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
														
 
															-    text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
														
 
															+    text = codecs.open("C:\\Users\\Administrator\\Desktop\\2.html", "r", encoding="utf8").read()
														
 
															     content = str(BeautifulSoup(text).find("div",id="pcontent"))
														
 
															     # df_a = {"html":[]}
														
 
															     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
														
@@ -69,6 +69,7 @@ if __name__=="__main__":
 
															     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
														
 
															     # print(predict("12", text))
														
 
															     print(predict("12", content))
														
 
															+    print(predict("12", content))
														
 
															     # test("12",text)
														
 
															     # test("12",content)
														
 
															     print("takes",time.time()-_time1)
														
--- a/BiddingKG/dl/time/re_servicetime.py
+++ b/BiddingKG/dl/time/re_servicetime.py