"""Regex-based detection of clarification / Q&A markers in bidding notices.

The public entry point is :func:`extract_channel_103`, which returns the
position of phrases that mark a document as a clarification, Q&A, addendum
or amendment notice.
"""
import re

import pandas as pd

# The test_* helpers below are manual smoke tests (one of them takes an
# argument), so they are intentionally left out of the star-import surface.
__all__ = [
    "channel_103",
    "channel_103_0",
    "channel_103_1",
    "channel_103_2",
    "channel_103_3",
    "channel_103_after",
    "channel_103_4",
    "channel_103_5",
    "not_channel_103",
    "re_standard_channel_103",
    "re_not_channel_103",
    "re_channel_103",
    "extract_channel_103",
]

# Sample phrasings the patterns below were written against (kept verbatim,
# because the regexes target this exact wording):
# 各投标人
# 各潜在投标人
# 各潜在投标人:
# 致各招标文件持有者:
# 致各投标人
# 各潜在投标供应商:
# 修改、澄清(答疑)纪要内容如下: 1、
# 答疑澄清与修改的主要内容:
# 对文件澄清与修改的主要内容
# 澄清、修改内容要点
# 答疑纪要
# 答疑如下
# 招标文件答疑和招标文件修改通知
# 招标文件答疑通知
# 答疑及补遗通知
# 答疑回复如下:
# 现对投标人提出的质疑回复如下:
# 对文件澄清与修改的主要内容 详见招标文件
# 修改的主要内容 详见附件
# 澄清或修改事项:
# 第1次答疑
# 第1次答疑澄清
# 答疑补遗文件
# 补遗书澄清文件 答疑澄清
# 质疑1
# 问题
# 答疑文件1
# 具体补遗内容详见附件
# 请问 答
# 问题 回复
# 答疑澄清公告 1:
# 现对招标文件作如下澄清:
# 详见答疑澄清文件
# 详见答疑文件。

# NOTE(review): several alternatives below are duplicated ("(:|:)", "[::]",
# "(1)|\(1\)"). This looks like full-width punctuation variants were
# normalised to ASCII at some point - confirm against the upstream file
# before relying on full-width input being matched. As written, the bare
# "(1)" alternative matches any digit "1".

# Cheap pre-filter: a document without any of these words is never a match.
channel_103 = '(澄清|答疑|补遗|修改)'
# Salutation to the bidders, e.g. "致各招标文件持有者:".
channel_103_0 = '(致|至|)(各|各个)(潜在|)(投标|招标|招标文件持有|报价|竞选|)(人|者|供应商|单位)(:|:)'
# Keyword followed by "content / reply / minutes ..." and a list opener.
channel_103_1 = (
    r'(澄清|答疑|补遗|修改|质疑)(.?)(具体内容|主要内容|内容|回复|发布|纪要|事项|如下){1,2}(.?)'
    r'(如下|[::]|详见|点击下载附件|[1一][::、]|(1)|\(1\)|一)'
)
# "N-th Q&A / clarification".
channel_103_2 = '第(.?)次(答疑|澄清)'
# "clarification notice / clarification document".
channel_103_3 = '(澄清|答疑|补遗|修改)(公告|文件)'
# Follow-up text that must appear shortly after a stage 2 / stage 3 hit.
channel_103_after = r'(请问|提问|问题|答复|回复|质疑|答|问){1,2}[12一]?[::]|[一1][::、]|(1)|\(1\)|(详见|见)(附件|答疑文件|澄清文件|答疑澄清文件)'
# Question / answer label followed by a colon.
channel_103_4 = '(补充答疑|提疑内容|请问|提问|问题|回复|答复|答疑|质疑|答|问)[12一]?[::]'
# "see the clarification document" style references.
channel_103_5 = '(见|详见)(答疑澄清文件|澄清文件|答疑文件)|补遗内容详见附件'

# Phrases that contain the keywords but do NOT indicate a clarification:
# 答疑澄清时间
# 对文件澄清与修改的主要内容 无澄清文件
# 对文件澄清与修改的主要内容 无
# 请各投标单位自行下载
not_channel_103 = '答疑澄清时间|主要内容.?无|请各投标单位'

# Number of characters after a hit that are searched for channel_103_after.
_FOLLOW_UP_WINDOW = 50

# (debug label, pattern, requires follow-up text) in priority order.
_STAGES = (
    ("0", channel_103_0, False),
    ("1", channel_103_1, False),
    ("2", channel_103_2, True),
    ("3", channel_103_3, True),
    ("4", channel_103_4, False),
    ("5", channel_103_5, False),
)


def _find_keywords(pattern, _str, needs_follow_up=False):
    """Return ``[keyword, [begin, end]]`` for every match of *pattern*.

    When *needs_follow_up* is true, a match is kept only if
    ``channel_103_after`` is found within the next ``_FOLLOW_UP_WINDOW``
    characters.
    """
    # The whole pattern is wrapped in a named group so the keyword can be read
    # back by name; the original "(?P" + pattern + ")" was not a valid regex.
    reg_standard = "(?P<value>" + pattern + ")"
    found = []
    for m in re.finditer(reg_standard, _str):
        begin, end = m.span()
        if needs_follow_up and not re.search(
                channel_103_after, _str[end:end + _FOLLOW_UP_WINDOW]):
            continue
        found.append([m.group("value"), [begin, end]])
    return found


def re_standard_channel_103(_str):
    """Find clarification markers in *_str* using the staged patterns.

    The stages are tried in priority order and the first stage that yields
    at least one hit wins; later stages are not evaluated.

    Args:
        _str: Text to scan (already masked by ``re_not_channel_103``).

    Returns:
        A list of ``[keyword, [begin, end]]`` entries, or ``[]`` when the text
        contains none of the base keywords or no stage matches.
    """
    if not re.search(channel_103, _str):
        print("not")
        return []

    for label, pattern, needs_follow_up in _STAGES:
        channel_103_list = _find_keywords(pattern, _str, needs_follow_up)
        if channel_103_list:
            print(label, channel_103_list)
            return channel_103_list
    return []


def re_not_channel_103(_str):
    """Mask phrases that look like clarification keywords but are not.

    Every match of ``not_channel_103`` is overwritten with ``#`` characters of
    the same length, so the offsets of the remaining text are unchanged.

    Args:
        _str: Raw text.

    Returns:
        The text with the confusing phrases masked.
    """
    # A callable replacement avoids re-interpreting the matched text as a
    # regex, which raised re.error when the wildcard matched e.g. "(".
    return re.sub(not_channel_103, lambda m: "#" * len(m.group()), _str)


def re_channel_103(text):
    """Mask confusing phrases in *text*, then search for clarification markers.

    Args:
        text: Document text. Non-string values (``None``, NaN from an empty
            CSV cell, ...) yield an empty result instead of raising.

    Returns:
        A list of ``[keyword, [begin, end]]`` entries.
    """
    if not isinstance(text, str):
        return []

    # Mask easily-confused phrases first.
    clean_text = re_not_channel_103(text)

    # Then look for the standard forms.
    return re_standard_channel_103(clean_text)


def extract_channel_103(text):
    """Extract clarification markers from *text* as dictionaries.

    Args:
        text: Document text.

    Returns:
        A list of ``{"body", "begin_index", "end_index"}`` dicts. If any hit
        is 20 characters or longer, or its span length disagrees with the
        keyword length, the whole result is discarded and ``[]`` is returned.
    """
    result_list = []
    for word, (begin, end) in re_channel_103(text):
        if word is None:
            continue
        span_length = end - begin
        if span_length != len(word) or span_length >= 20:
            return []
        result_list.append(
            {"body": word, "begin_index": begin, "end_index": end}
        )
    return result_list


def test_csv(_path):
    """Run the matcher over the ``doctextcon`` column and write results back.

    Manual helper: reads the CSV at *_path*, appends one column holding the
    stringified matches, and overwrites the same file.
    """
    df = pd.read_csv(_path)
    predict_list = []
    for index, row in df.iterrows():
        word_list = re_channel_103(row["doctextcon"])
        predict = word_list if word_list else []
        print("predict", predict)
        predict_list.append(str(predict))
    predict_df = pd.DataFrame(predict_list)
    df = pd.concat([df, predict_df], axis=1)
    df.to_csv(_path)
    print("finish write!")


def test_str():
    """Manual helper: print the extraction result for a sample notice."""
    s = ''' (第1次澄清) 发布时间:2020-11-25 致各招标文件持有者: 招标人──舟山市 '''
    print(extract_channel_103(s))


def test_html():
    """Manual helper: print the extraction result for a local HTML file."""
    html_path = "C:/Users/Administrator/Desktop/3.html"
    with open(html_path, "r") as f:
        s = f.read()
    print(extract_channel_103(s))


if __name__ == "__main__":
    path = "D:\\BIDI_DOC\\比地_文档\\澄清答疑_result.csv"
    # test_csv(path)
    test_str()
    # test_html()