123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219 |
- import pandas as pd
- import re
- # 各投标人
- # 各潜在投标人
- # 各潜在投标人:
- # 致各招标文件持有者:
- # 致各投标人
- # 各潜在投标供应商:
- # 修改、澄清(答疑)纪要内容如下: 1、
- # 答疑澄清与修改的主要内容:
- # 对文件澄清与修改的主要内容
- # 澄清、修改内容要点
- # 答疑纪要
- # 答疑如下
- # 招标文件答疑和招标文件修改通知
- # 招标文件答疑通知
- # 答疑及补遗通知
- # 答疑回复如下:
- # 现对投标人提出的质疑回复如下:
- # 对文件澄清与修改的主要内容 详见招标文件
- # 修改的主要内容 详见附件
- # 澄清或修改事项:
- # 第1次答疑
- # 第1次答疑澄清
- # 答疑补遗文件
- # 补遗书澄清文件 答疑澄清
- # 质疑1
- # 问题
- # 答疑文件1
- # 具体补遗内容详见附件
- # 请问 答
- # 问题 回复
- # 答疑澄清公告 1:
- # 现对招标文件作如下澄清:
- # 详见答疑澄清文件
- # 详见答疑文件。
# Regular-expression fragments used to locate the header of a tender
# "clarification / Q&A / addendum / amendment" notice.
#
# Tender documents use both ASCII and full-width punctuation, so every
# punctuation alternative lists both forms.  The full-width characters are
# written as escapes so they cannot be silently folded to ASCII again:
#   \uff1a = full-width colon, \uff08 / \uff09 = full-width parentheses.

# Gate: the text must mention at least one of these keywords.
channel_103 = '(澄清|答疑|补遗|修改)'

# Stage 0: salutation such as "致各招标文件持有者:" / "各潜在投标人:".
channel_103_0 = '(致|至|)(各|各个)(潜在|)(投标|招标|招标文件持有|报价|竞选|)(人|者|供应商|单位)(:|\uff1a)'

# Stage 1: keyword + "content / reply / minutes ..." + a lead-in marker
# (colon, "如下", "详见", a first list number, ...).
channel_103_1 = (
    '(澄清|答疑|补遗|修改|质疑)(.?)(具体内容|主要内容|内容|回复|发布|纪要|事项|如下){1,2}(.?)'
    '(如下|[:\uff1a]|详见|点击下载附件|[1一][:\uff1a、]|\uff081\uff09|\\(1\\)|一)'
)

# Stage 2: "第N次答疑" / "第N次澄清".
channel_103_2 = '第(.?)次(答疑|澄清)'

# Stage 3: keyword followed by "公告" or "文件".
channel_103_3 = '(澄清|答疑|补遗|修改)(公告|文件)'

# Follow-up cue that must appear shortly after a stage 2 / stage 3 hit:
# a question/answer label, a first list number, or a "see attachment" phrase.
# The first-item marker is "(1)" in full-width or ASCII parentheses; a bare
# digit 1 must NOT qualify, otherwise any date or amount would satisfy the cue.
channel_103_after = (
    '(请问|提问|问题|答复|回复|质疑|答|问){1,2}[12一]?[:\uff1a]'
    '|[一1][:\uff1a、]'
    '|\uff081\uff09|\\(1\\)'
    '|(详见|见)(附件|答疑文件|澄清文件|答疑澄清文件)'
)

# Stage 4: a question/answer label followed by a colon, e.g. "问题1:".
channel_103_4 = '(补充答疑|提疑内容|请问|提问|问题|回复|答复|答疑|质疑|答|问)[12一]?[:\uff1a]'

# Stage 5: "see the clarification file / attachment" phrases.
channel_103_5 = '(见|详见)(答疑澄清文件|澄清文件|答疑文件)|补遗内容详见附件'

# Look-alike phrases that must not be treated as clarification headers.
# Samples:
#   答疑澄清时间
#   对文件澄清与修改的主要内容 无澄清文件
#   对文件澄清与修改的主要内容 无
#   请各投标单位自行下载
not_channel_103 = '答疑澄清时间|主要内容.?无|请各投标单位'
def re_standard_channel_103(_str):
    """Locate clarification/Q&A header phrases in ``_str``.

    The text must contain at least one ``channel_103`` keyword; otherwise an
    empty list is returned.  The staged patterns ``channel_103_0`` through
    ``channel_103_5`` are then tried in that order, and the hits of the first
    stage that yields any are returned; later stages are not consulted.
    Hits of stages 2 and 3 only count when ``channel_103_after`` matches
    somewhere in the 50 characters that follow the hit.

    A short trace is printed to stdout: "not" when the keyword gate fails,
    otherwise the stage label together with that stage's hits.

    Args:
        _str: Text to scan.

    Returns:
        list: ``[matched_text, [start, end]]`` entries, where the offsets
        index into ``_str``.  Empty when nothing qualifies.
    """
    if re.search(channel_103, _str) is None:
        print("not")
        return []

    # (trace label, pattern, whether a follow-up cue is required)
    stages = (
        ("0", channel_103_0, False),
        ("1", channel_103_1, False),
        ("2", channel_103_2, True),
        ("3", channel_103_3, True),
        ("4", channel_103_4, False),
        ("5", channel_103_5, False),
    )

    for label, pattern, needs_cue in stages:
        hits = []
        for found in re.finditer("(?P<value>" + pattern + ")", _str):
            begin, end = found.span()
            if needs_cue and re.search(channel_103_after, _str[end:end + 50]) is None:
                continue
            hits.append([found.group("value"), [begin, end]])
        if hits:
            # First productive stage wins.
            print(label, hits)
            return hits

    return []
def re_not_channel_103(_str):
    """Mask look-alike phrases so later header matching ignores them.

    Every match of ``not_channel_103`` is overwritten with ``#`` characters
    of the same length, so character offsets in the returned string still
    line up with the input.

    The substitution is done in a single pass with a callable replacement.
    Re-using the matched text as a pattern would misbehave whenever the
    wildcard in ``not_channel_103`` captured a regex metacharacter: a match
    like "主要内容(无" would raise ``re.error`` and "主要内容?无" would not be
    masked at all.

    Args:
        _str: Text to clean.

    Returns:
        str: ``_str`` with each look-alike phrase replaced by ``#`` padding.
    """
    return re.sub(not_channel_103, lambda m: "#" * len(m.group()), _str)
def re_channel_103(text):
    """Find clarification/Q&A header phrases in ``text``.

    Look-alike phrases are masked first (``re_not_channel_103``), then the
    masked text is scanned by ``re_standard_channel_103``, whose result is
    returned unchanged.
    """
    masked_text = re_not_channel_103(text)
    return re_standard_channel_103(masked_text)
def extract_channel_103(text):
    """Return the header phrases found in ``text`` as dictionaries.

    Each hit reported by ``re_channel_103`` becomes
    ``{"body": phrase, "begin_index": start, "end_index": end}``.

    If any hit's span length differs from the phrase length, or is 20
    characters or more, the whole result is discarded and ``[]`` is returned.
    """
    entries = []
    for phrase, span in re_channel_103(text) or []:
        if phrase is None:
            continue
        begin, end = span[0], span[1]
        width = end - begin
        # One implausible hit invalidates everything collected so far.
        if width != len(phrase) or width >= 20:
            return []
        entries.append({"body": phrase, "begin_index": begin, "end_index": end})
    return entries
def test_csv(_path):
    """Run the header finder over a CSV and write the predictions back.

    Reads ``_path``, applies ``re_channel_103`` to the ``doctextcon`` column
    of every row, appends the stringified results as an extra column and
    overwrites the same file.

    Args:
        _path: Path of the CSV file to read and overwrite.
    """
    df = pd.read_csv(_path)
    predict_list = []
    for _, row in df.iterrows():
        # re_channel_103 accepts only the text; the extra "" argument that
        # used to be passed here raised TypeError on the first row.
        predict = re_channel_103(row["doctextcon"]) or []
        print("predict", predict)
        predict_list.append(str(predict))
    predict_df = pd.DataFrame(predict_list)
    df = pd.concat([df, predict_df], axis=1)
    df.to_csv(_path)
    print("finish write!")
def test_str():
    """Print what extract_channel_103 finds in an inline sample notice."""
    # Spare sample without any clarification keyword; it is not passed to
    # the extractor.
    unrelated_sample = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
    clarification_sample = '''
(第1次澄清) 发布时间:2020-11-25 致各招标文件持有者: 招标人──舟山市
'''
    print(extract_channel_103(clarification_sample))
def test_html():
    """Print what extract_channel_103 finds in a local HTML file.

    The file path is hard-coded; the file is read with the platform's
    default text encoding.
    """
    html_path = "C:/Users/Administrator/Desktop/3.html"
    with open(html_path, "r") as f:
        s = f.read()
    # extract_channel_103 takes the text only; the former title="" keyword
    # argument raised TypeError.
    print(extract_channel_103(s))
if __name__ == "__main__":
    # Manual driver. Only the inline-string check runs by default; the CSV
    # and HTML checks can be called by hand instead (test_csv uses `path`).
    path = "D:\\BIDI_DOC\\比地_文档\\澄清答疑_result.csv"
    test_str()
|