# Regex-based extraction and normalization of the procurement method ("bidway") from text.
import ast
import re

import pandas as pd
- # from BiddingKG.dl.interface import Entitys
- # def re_bidway_old(text):
- # df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
- #
- # reg = re.compile(u'(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
- # u'|发包方式|发包类型|开展方式|招标类型)(.*)'
- # u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
- # u'|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
- # u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
- # u'|网上招标|其他'
- # u'|竞谈竞价|网上直购|公开竞谈'
- # u'|库内邀请|库内公开发包)')
- #
- # # reg = re.compile(u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源|网络竞价'
- # # u'|竞争性谈判|公开询价|邀请招标|公开招募|公开询比价|电子书面竞投'
- # # u'|网上电子投标|比质比价|定向询单|国内比选|电子竞价'
- # # u'|公开招租|公开竞标方式|网上招标|公开招标|国内竞争性谈判'
- # # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
- # # u'|库内邀请|询价采购|询比采购|分散采购|单一来源采购)')
- #
- # reg2 = re.compile(u'(采用|以|)'
- # u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
- # u'|竞争性谈判|询价|电子书面竞投|电子竞价'
- # u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
- # u'|网上招标|分散采购'
- # u'|竞谈竞价|网上直购|公开竞谈'
- # u'|库内邀请)'
- # u'(采购方式|方式)')
- #
- # reg1 = re.compile(
- # # u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源采购|网络竞价|公开招商方式'
- # # u'|竞争性谈判|公开询价|询价采购|邀请招标|公开招募|公开询比|电子书面竞投'
- # # u'|网上电子投标|比质比价|定向询单|询比采购|国内比选|单一来源|公开选取|库内公开发包'
- # # u'|公开招租|公开竞标方式|网上招标|公开招标|竞争性谈判|公开招投标'
- # # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
- # # u'|国际公开竞争性招标)'
- # u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
- # u'|竞争性谈判|询价|电子书面竞投'
- # u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
- # u'|网上招标|分散采购'
- # u'|竞谈竞价|网上直购|公开竞谈'
- # u'|库内邀请)'
- # )
- #
- # # 都切为4个字符
- # # reg1_not = re.compile(u'(及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录)')
- # reg1_not = re.compile(u'(及单一来|价小组成|除单一来|性谈判邀|询价记录)')
- #
- # reg3 = re.compile(u'(采购方式:邀请|采购方式:公开|采购方式:询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
- #
- #
- # reg_standard = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|单一来源'
- # u'|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
- # u'|网上电子投标|比质比价|询单|比选'
- # u'|公开招租|网上招标|分散采购'
- # u'|网上直购|公开竞谈|采购方式:邀请|采购方式:公开|采购方式:询价)'
- # )
- #
- # text_list = df["text"].to_list()
- # # text_list = []
- # # text_list.append(text)
- # text_index_list = []
- # output_list = []
- # for index in range(len(text_list)):
- # # 全文下标
- # text_index = [0, 0]
- #
- # input_str = text_list[index]
- #
- # # 把一些混淆的词先替换掉
- # input_str = re.sub(reg1_not, "####", input_str)
- #
- # match = reg.search(input_str)
- # output_str = None
- # # 根据正则表达式匹配
- # if match:
- # # 更新全文下标
- # text_index[0] = match.start()
- # text_index[1] = match.end()
- #
- # # 判断长度,截断
- # if len(match.group()) >= 15:
- # ss = re.split(",|\.|,|。|;|;", match.group())
- # # 判断所需的字符串在哪一段
- # for i in range(len(ss)):
- # if re.search(reg1, ss[i]):
- # output_str = ss[i]
- #
- # # 更新全文下标
- # front_len, back_len = calculateLen(ss, i)
- # text_index[0] = text_index[0] + front_len + i
- # text_index[1] = text_index[1] - back_len + len(ss) -1 - i
- #
- # break
- # else:
- # output_str = match.group()
- #
- # else:
- # match2 = re.search(reg2, input_str)
- # if match2:
- # # 更新全文下标
- # text_index[0] = match2.start()
- # text_index[1] = match2.end()
- #
- # output_str = match2.group()
- #
- # else:
- # match1 = re.search(reg1, input_str)
- # if match1:
- # # 更新全文下标
- # text_index[0] = match1.start()
- # text_index[1] = match1.end()
- # output_str = match1.group()
- #
- # # 再判断一次长度
- # if output_str is not None:
- # if len(output_str) >= 15:
- # match2 = re.search(reg2, input_str)
- # if match2:
- # # 更新全文下标
- # text_index[0] = match2.start()
- # text_index[1] = match2.end()
- #
- # output_str = match2.group()
- # if len(output_str) >= 15:
- # match1 = re.search(reg1, input_str)
- # if match1:
- # # 更新全文下标
- # text_index[0] = match1.start()
- # text_index[1] = match1.end()
- #
- # output_str = match1.group()
- #
- # # 最后输出还为空,匹配一些易混淆的词
- # if output_str is None:
- # match3 = re.search(reg3, input_str)
- # if match3:
- # # 更新全文下标
- # text_index[0] = match3.start()
- # text_index[1] = match3.end()
- #
- # output_str = match3.group()
- #
- # # 处理前缀等无用词
- # if output_str is not None:
- # match5 = re.search("分散采购|采购方式:邀请", output_str)
- # if not match5:
- # # 公开采购转为公开招标
- # output_str = re.sub("公开采购", "公开招标", output_str)
- #
- # # 去掉第一个字符冒号
- # ss = re.split(":|:", output_str)
- # output_str = ss[-1]
- # # 更新全文下标
- # front_len, back_len = calculateLen(ss, len(ss) - 1)
- # text_index[0] = text_index[0] + front_len + len(ss) - 1
- #
- # # 去掉采购、方式、采用
- # match6 = re.search("(采用|出售|直接(|现就本次|招标为)", output_str)
- # match7 = re.search("(采购|方式|进行)", output_str)
- # output_str = re.sub("(采购|方式|采用|出售|进行|直接(|现就本次|招标为)", "", output_str)
- # # 更新全文下标
- # if match6:
- # text_index[0] += match6.end() - match6.start()
- # if match7:
- # text_index[1] -= match7.end() - match7.start()
- #
- # # 使用标准标签过滤
- # match4 = re.search(reg_standard, output_str)
- # if match4:
- # output_str = match4.group()
- # # 更新全文下标
- # text_index[0] += match4.start()
- # text_index[1] = text_index[0] + match4.end() - match4.start()
- #
- # output_list.append(output_str)
- # # text_index_list.append(str(text_index))
- # text_index_list.append(text_index)
- #
- # # df["re"] = pd.DataFrame(output_list)
- # # df["text_index"] = pd.DataFrame(text_index_list)
- #
- # # index_to_word = []
- # # for index, row in df.iterrows():
- # # i_list = ast.literal_eval(row["text_index"])
- # # word = row["text"][i_list[0]:i_list[1]]
- # # if len(word) >= 20:
- # # word = ""
- # # index_to_word.append(word)
- #
- #
- # # df["index2word"] = pd.DataFrame(index_to_word)
- # # df.to_csv("C:\\Users\\admin\\Desktop\\bidway_text2.csv")
- #
- # return output_list[0], text_index_list[0]
# Regex alternation of every recognised procurement-method keyword.
# Order matters: longer variants are listed before the shorter keywords they
# contain at the same start position (e.g. '询比价' before '询比').
bidway = (
    '单一来源'
    '|国内竞争性磋商|竞争性磋商|竞争性谈判|网络竞价|网上竞价|公开竞谈|公开竞价|电子竞价|竞价|竞标|竞谈竞价|电子书面竞投'
    '|公开比选|比质比价|比选'
    '|公开招标|公开招租|公开招募|公开选取|公开招投标'
    '|网上直购|网上招标|网上电子投标|网上挂牌'
    '|邀请招标'
    '|网上询价|公开询价|非定向询价|定向询价|询比价|询单|询价|询比'
    '|库内邀请|库内公开发包|内部邀标'
    '|定点采购议价|定点采购'
    '|竞争性评审'
)

# Phrases that contain a keyword but must be masked before keyword matching.
not_bidway = (
    '及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录|自由竞价'
    '|限时竞价|咨询单位|询价单'
)

# A keyword directly preceded by one of these words is masked.
not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除"

# A keyword directly followed by one of these words is masked.
not_bidway_suffix = (
    "文件|报名|邀请|项目|失败|数量|编号|后|时间|类型|名称|和|成交"
    "|标题|开始|结束|产品|报价|供应商|部门|监督|需求|范围|入围|内容|人"
    "|条件|公司|保证金|完毕|事件|成功|活动|地点|标|会|须知|范围"
    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败"
)

# Field labels that introduce a procurement method in the "standard" form
# (label, one or two separator characters, keyword).
bidway_preffix = (
    '采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
    '|发包方式|发包类型|开展方式|招标类型|选取方式|招租方式'
)

# Last-resort patterns, tried only when no plain keyword was found.
# The '.' is a regex wildcard for a single marker character.
bidway_special = (
    '采购方式:公开|采购方式:邀请|采购方式:询价'
    '|招标方式:.公开|采购方式:.公开'
    '|分散采购'
)
def re_not_bidway(_str):
    """Mask misleading phrases so they cannot be mistaken for a bidway keyword.

    Three rules are applied in order, each on the output of the previous one:

    1. any phrase listed in ``not_bidway``;
    2. a ``bidway`` keyword immediately followed by a ``not_bidway_suffix`` word;
    3. a ``not_bidway_preffix`` word immediately followed by a ``bidway`` keyword.

    Every hit is overwritten with the same number of ``#`` characters, so the
    returned string has the same length as the input and character offsets
    computed on it remain valid for the original text.

    :param _str: text to clean.
    :return: the masked copy of ``_str``.
    """
    def _mask(match):
        # Same-length replacement keeps downstream offsets aligned.
        return "#" * len(match.group())

    # Each rule is a single substitution pass over exactly the matched spans;
    # the matched text is never re-interpreted as a regular expression.
    _str = re.sub(not_bidway, _mask, _str)
    _str = re.sub("(" + bidway + ")" + "(" + not_bidway_suffix + ")", _mask, _str)
    _str = re.sub("(" + not_bidway_preffix + ")" + "(" + bidway + ")", _mask, _str)
    return _str
def re_standard_bidway(_str):
    """Find keywords written in the standard "label + separator + keyword" form.

    A hit is a ``bidway_preffix`` label, then one or two arbitrary separator
    characters, then a ``bidway`` keyword.

    :param _str: text to search (normally already cleaned by ``re_not_bidway``).
    :return: list of ``[keyword, [start, end]]`` in order of appearance, where
        ``start``/``end`` delimit the keyword itself (label and separator
        excluded) inside ``_str``.
    """
    # The separator is lazy on purpose: a greedy ``.{1,2}`` would swallow the
    # first character of keywords such as '非定向询价' and report '定向询价'.
    reg_standard = "(?P<preffix>" + bidway_preffix + ")" \
                   + "(?P<char>.{1,2}?)" \
                   + "(?P<value>" + bidway + ")"
    return [[m.group("value"), list(m.span("value"))]
            for m in re.finditer(reg_standard, _str)]
def re_all_bidway(_str):
    """Find every ``bidway`` keyword in the text, with no context required.

    :param _str: text to search.
    :return: list of ``[keyword, [start, end]]`` in order of appearance.
    """
    reg_all = "(?P<value>" + bidway + ")"
    return [[m.group(), list(m.span())] for m in re.finditer(reg_all, _str)]
def re_special_bidway(_str):
    """Find the fallback patterns listed in ``bidway_special``.

    :param _str: text to search.
    :return: list of ``[matched_text, [start, end]]`` in order of appearance.
    """
    reg_special = "(?P<value>" + bidway_special + ")"
    return [[m.group(), list(m.span())] for m in re.finditer(reg_special, _str)]
def get_one_word(bidway_list):
    """Pick a single keyword out of a list of hits.

    Duplicated keywords are collapsed to their earliest occurrence; the keyword
    that starts first in the text wins, and when several start at the same
    position the longest one is chosen.

    :param bidway_list: list of ``[keyword, [start, end]]`` entries.
    :return: ``(keyword, [start, end])``, or ``(None, [0, 0])`` for an empty list.
    """
    if not bidway_list:
        return None, [0, 0]
    if len(bidway_list) == 1:
        return bidway_list[0][0], bidway_list[0][1]

    # Keep only the earliest span of each distinct keyword.
    earliest = {}
    for keyword, span in bidway_list:
        if keyword not in earliest or span[0] < earliest[keyword][0]:
            earliest[keyword] = span

    # Ascending sort on (-start, length): the last entry is the one with the
    # smallest start and, among equal starts, the greatest length.
    ranked = sorted(earliest, key=lambda k: (-int(earliest[k][0]), len(k)))
    word = ranked[-1]
    return word, earliest[word]
def re_bidway(text, title):
    """Locate the procurement-method keyword for one document.

    Strategies are tried in priority order and the first one that produces a
    hit decides the result:

    1. standard "label + separator + keyword" form in the body (first hit);
    2. any keyword in the title (chosen by ``get_one_word``);
    3. any keyword in the body (chosen by ``get_one_word``);
    4. special fallback patterns in the body (first hit).

    :param text: document body.
    :param title: document title.
    :return: ``(keyword, [start, end])`` or ``(None, [0, 0])`` when nothing matches.
    """
    # Mask phrases that merely look like a keyword.
    text_clean = re_not_bidway(text)
    title_clean = re_not_bidway(title)

    # 1. Standard form in the body.
    hits = re_standard_bidway(text_clean)
    if hits:
        return hits[0][0], hits[0][1]

    # 2./3. Bare keywords: title first, then body.
    # NOTE(review): a title hit returns offsets relative to the title, not the body.
    for source in (title_clean, text_clean):
        hits = re_all_bidway(source)
        if hits:
            word, text_index = get_one_word(hits)
            return word, text_index

    # 4. Special fallback forms in the body.
    hits = re_special_bidway(text_clean)
    if hits:
        return hits[0][0], hits[0][1]

    # Nothing found.
    return None, [0, 0]
def extract_bidway(text, title):
    """Return the detected procurement method as a list of at most one entity.

    :param text: document body.
    :param title: document title.
    :return: ``[]`` when nothing is found or the hit fails the sanity check,
        otherwise ``[{"body": keyword, "begin_index": start, "end_index": end}]``.
    """
    word, text_index_list = re_bidway(text, title)
    if word is None:
        return []

    begin, end = text_index_list[0], text_index_list[1]
    span_length = end - begin
    # Reject hits whose span does not match the keyword length, and any span
    # of 10 characters or more.
    if span_length != len(word) or span_length >= 10:
        return []

    return [{"body": word, "begin_index": begin, "end_index": end}]
# Maps an extracted keyword to its normalized category name.
bidway_dict = {
    '询价': '询价',
    '竞争性谈判': '竞争性谈判',
    '公开比选': '其他',
    '国内竞争性磋商': '竞争性磋商',
    '招标方式:t公开': '公开招标',
    '竞价': '竞价',
    '竞标': '竞价',
    '电子竞价': '竞价',
    '电子书面竞投': '竞价',
    '单一来源': '单一来源',
    '网上竞价': '竞价',
    '公开招标': '公开招标',
    '询比': '询价',
    '定点采购': '其他',
    '招标方式:■公开': '公开招标',
    '交易其他,付款其他': '其他',
    '竞争性评审': '竞争性磋商',
    '公开招租': '其他',
    '\\N': '',
    '比选': '其他',
    '比质比价': '其他',
    '分散采购': '其他',
    '内部邀标': '邀请招标',
    '邀请招标': '邀请招标',
    '网上招标': '公开招标',
    '非定向询价': '询价',
    '网络竞价': '竞价',
    '公开询价': '询价',
    '定点采购议价': '其他',
    '询单': '询价',
    '网上挂牌': '其他',
    '网上直购': '其他',
    '定向询价': '询价',
    '采购方式:公开': '公开招标',
    '磋商': '竞争性磋商',
    '公开招投标': '公开招标',
    '招标方式:√公开': '公开招标',
    '公开选取': '公开招标',
    '网上电子投标': '公开招标',
    '公开竞谈': '竞争性谈判',
    '竞争性磋商': '竞争性磋商',
    '采购方式:邀请': '邀请招标',
    '公开竞价': '竞价',
    '其他': '其他',
    '公开招募': '其他',
    '网上询价': '询价',
    # Forms the extractor can emit that previously fell through to '其他'.
    '采购方式:询价': '询价',
    '询比价': '询价',
}

# Wildcard "public" forms the extractor emits with an arbitrary one-character
# marker between the label and '公开' (e.g. a check-box glyph).
_PUBLIC_SPECIAL_FORM = re.compile(r'(?:招标方式|采购方式):.公开')


# Normalize a bidway name to its standard category.
def bidway_integrate(bidway):
    """Map an extracted keyword to its normalized category name.

    :param bidway: keyword as produced by the extractor.
    :return: the category from ``bidway_dict``; ``'公开招标'`` for a wildcard
        "public" form not listed explicitly; ``'其他'`` for anything unknown.
    """
    integrate_name = bidway_dict.get(bidway)
    if integrate_name is not None:
        return integrate_name
    if isinstance(bidway, str) and _PUBLIC_SPECIAL_FORM.fullmatch(bidway):
        return '公开招标'
    return '其他'
def test_csv(csv_path="C:\\Users\\Administrator\\Desktop\\bidway_text.csv",
             result_path="C:\\Users\\Administrator\\Desktop\\bidway_result.csv"):
    """Run ``re_bidway`` over the ``text`` column of a CSV and save the predictions.

    Each prediction is stored as the string form of ``[keyword, [start, end]]``
    (or ``[]`` when nothing was found) in an extra column appended to the input
    frame, which is then written to ``result_path``.

    :param csv_path: input CSV containing a ``text`` column.
    :param result_path: destination CSV for the input plus predictions.
    """
    df = pd.read_csv(csv_path)

    predict_list = []
    for _, row in df.iterrows():
        word, text_index = re_bidway(row["text"], "")
        predict = [word, text_index] if word else []
        print("predict", predict)
        predict_list.append(str(predict))

    predict_df = pd.DataFrame(predict_list)
    df = pd.concat([df, predict_df], axis=1)
    df.to_csv(result_path)
    print("finish write!")
def test_str():
    """Print the extraction result for a hard-coded sample notice."""
    # Shorter alternative sample:
    # s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
    s = '''
,关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知,各投标人:深圳市国际招标有限公司受中共
深圳市委军民融合发展委员会办公室委托,就人防工程技术咨询服务项目【重新招标】(项目编号:0658-2171
1A60965),进行公开招标,因投标单位不足三家,公开招标失败,现经采购单位同意,采用单一来源谈判方式
确定中标供应商,邀请中国建筑标准设计研究院有限公司前来谈判,一、项目编号:0658-21711A60965,二
、项目名称:人防工程技术咨询服务项目【重新招标】,三、凡被邀请参加谈判的供应商必须按照原招标文件第
六章要求制作谈判文件正本一本,副本二本,按规定的时间密封递交并参加谈判,四、谈判内容:投标价格、项
目实施方案、售后服务方案和其它相关事项,五、地点及时间:1、因疫情影响本项目谈判响应文件采用邮寄方
式接收文件,2、文件接收截止时间:2021年11月5日14:30(北京时间),3、谈判响应文件邮寄地址:深圳
市罗湖区嘉宾路2018号深华商业大厦裙楼6层600A。收件人:郑工,电话:18806665013,3、谈判地点:
线上谈判,六、谈判的相关规则按原招标文件的相应规定执行;有关谈判事宜详见招标文件第六章《公开招标失
败后后续采购程序和投标须知》,1、采购人信息,名称:中共深圳市委军民融合发展委员会办公室,地址:深
圳市福田区新洲路5008号,联系方式:刘先生,电话:0755-88100332,2、采购代理机构信息,名称:深
圳市国际招标有限公司,地址:罗湖总部:深圳市罗湖区嘉宾路2018号深华商业大厦裙楼6层,深圳湾总部:深
圳市南山区沙河西路与白石路交汇处深圳湾科技生态园9栋B4座6楼,联系方式:0755-22918634,监督举报
电话:0755-22965602、0755-86660475,特此通知,深圳市国际招标有限公司,2021年11月1日,更多
咨询报价请点击:http://zbcloud.net/bidbulletin/69495.htm,
'''
    print(extract_bidway(s, title=""))
def test_html(html_path="C:/Users/Administrator/Desktop/3.html"):
    """Print the extraction result for the contents of a local file.

    :param html_path: file whose full contents are passed to ``extract_bidway``.
    """
    # Opened with the platform default encoding, as before.
    with open(html_path, "r") as f:
        s = f.read()
    print(extract_bidway(s, title=""))
if __name__ == "__main__":
    # Manual smoke run; enable the sample to try.
    # extract_bidway(s)
    # test_csv()
    test_str()
    # test_html()
|