123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229 |
- import ast
- import pandas as pd
- import re
- from BiddingKG.dl.interface import Entitys
# Statement introduced by a field keyword ("采购方式", "招标方式", ...), followed by
# anything, and ending at the last procurement-method label on the same line.
_BIDWAY_KEYED_RE = re.compile(u'(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
                              u'|发包方式|发包类型|开展方式|招标类型)(.*)'
                              u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
                              u'|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
                              u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
                              u'|网上招标|其他'
                              u'|竞谈竞价|网上直购|公开竞谈'
                              u'|库内邀请|库内公开发包)')
# Phrase form: optional "采用"/"以" + label + "采购方式"/"方式".
_BIDWAY_PHRASE_RE = re.compile(u'(采用|以|)'
                               u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
                               u'|竞争性谈判|询价|电子书面竞投|电子竞价'
                               u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
                               u'|网上招标|分散采购'
                               u'|竞谈竞价|网上直购|公开竞谈'
                               u'|库内邀请)'
                               u'(采购方式|方式)')
# Bare label anywhere in the text.
_BIDWAY_LABEL_RE = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
                              u'|竞争性谈判|询价|电子书面竞投'
                              u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
                              u'|网上招标|分散采购'
                              u'|竞谈竞价|网上直购|公开竞谈'
                              u'|库内邀请)')
# Look-alike fragments that must not be read as labels.  Every alternative is
# exactly four characters so that masking with "####" keeps all offsets valid.
_BIDWAY_CONFUSING_RE = re.compile(u'(及单一来|价小组成|除单一来|性谈判邀|询价记录)')
# Last-resort, looser wording.
_BIDWAY_FALLBACK_RE = re.compile(u'(采购方式:邀请|采购方式:公开|采购方式:询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
# Canonical labels the final answer is narrowed to when possible.
_BIDWAY_STANDARD_RE = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|单一来源'
                                 u'|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
                                 u'|网上电子投标|比质比价|询单|比选'
                                 u'|公开招租|网上招标|分散采购'
                                 u'|网上直购|公开竞谈|采购方式:邀请|采购方式:公开|采购方式:询价)')
# Single-character clause separators and colons, half- and full-width.
_BIDWAY_CLAUSE_SEP_RE = re.compile(u'[,,.。;;]')
_BIDWAY_COLON_RE = re.compile(u'[::]')
# Candidates containing these are kept verbatim (no colon/noise stripping).
_BIDWAY_KEEP_VERBATIM_RE = re.compile(u'分散采购|采购方式:邀请')
# Filler words removed from a candidate.  The bracket after "直接" is a literal
# character; it is written as a character class because a bare "(" would open
# an unterminated group and make the pattern invalid.
_BIDWAY_LEADING_NOISE_RE = re.compile(u'(采用|出售|直接[((]|现就本次|招标为)')
_BIDWAY_TRAILING_NOISE_RE = re.compile(u'(采购|方式|进行)')
_BIDWAY_NOISE_RE = re.compile(u'(采购|方式|采用|出售|进行|直接[((]|现就本次|招标为)')
# Candidates at least this long are considered too wide and get narrowed.
_BIDWAY_MAX_CANDIDATE_LEN = 15


def _bidway_short_match(input_str):
    """Return the phrase-form match if any, otherwise the bare-label match."""
    return _BIDWAY_PHRASE_RE.search(input_str) or _BIDWAY_LABEL_RE.search(input_str)


def re_bidway(text):
    """Extract the procurement / bidding method mentioned in ``text``.

    Matching is attempted in decreasing order of confidence: a keyword-led
    statement, a "采用…方式" phrase, a bare label, and finally a looser
    fallback pattern.  The candidate is then stripped of its field prefix and
    filler words and, when possible, narrowed to a canonical label.

    Args:
        text: The announcement text to scan.

    Returns:
        A tuple ``(label, [begin, end])``.  ``label`` is the extracted method
        or ``None`` when nothing matched.  ``begin``/``end`` are offsets into
        ``text`` (end exclusive).  They are derived heuristically: filler
        words are assumed to sit at the start (leading noise) or end
        (trailing noise) of the candidate, so the span is only approximate
        when a filler word occurs elsewhere in it.
    """
    span = [0, 0]
    if not text:
        return None, span

    # Mask look-alike fragments first; the replacement has the same length as
    # every alternative, so offsets in input_str are offsets in text.
    input_str = _BIDWAY_CONFUSING_RE.sub("####", text)
    output_str = None

    keyed = _BIDWAY_KEYED_RE.search(input_str)
    if keyed:
        span[0], span[1] = keyed.start(), keyed.end()
        statement = keyed.group()
        if len(statement) >= _BIDWAY_MAX_CANDIDATE_LEN:
            # Too wide: keep only the first clause that contains a label.
            offset = keyed.start()
            for clause in _BIDWAY_CLAUSE_SEP_RE.split(statement):
                if _BIDWAY_LABEL_RE.search(clause):
                    output_str = clause
                    span[0] = offset
                    span[1] = offset + len(clause)
                    break
                # +1 skips the single-character separator after the clause.
                offset += len(clause) + 1
        else:
            output_str = statement
    else:
        found = _bidway_short_match(input_str)
        if found:
            span[0], span[1] = found.start(), found.end()
            output_str = found.group()

    # A clause can still be too wide; fall back to the short patterns, whose
    # matches are always below the length limit.
    if output_str is not None and len(output_str) >= _BIDWAY_MAX_CANDIDATE_LEN:
        found = _bidway_short_match(input_str)
        if found:
            span[0], span[1] = found.start(), found.end()
            output_str = found.group()

    # Still nothing: try the looser wording.
    if output_str is None:
        found = _BIDWAY_FALLBACK_RE.search(input_str)
        if found:
            span[0], span[1] = found.start(), found.end()
            output_str = found.group()

    if output_str is None:
        return None, span

    if not _BIDWAY_KEEP_VERBATIM_RE.search(output_str):
        # "公开采购" is normalised to "公开招标" (same length, offsets unaffected).
        output_str = output_str.replace(u"公开采购", u"公开招标")
        # Drop everything up to and including the last colon.
        candidate_len = len(output_str)
        output_str = _BIDWAY_COLON_RE.split(output_str)[-1]
        span[0] += candidate_len - len(output_str)
        # Remove filler words, shrinking the span from the matching side.
        leading = _BIDWAY_LEADING_NOISE_RE.search(output_str)
        trailing = _BIDWAY_TRAILING_NOISE_RE.search(output_str)
        output_str = _BIDWAY_NOISE_RE.sub("", output_str)
        if leading:
            span[0] += len(leading.group())
        if trailing:
            span[1] -= len(trailing.group())

    # Narrow to a canonical label when one is present.
    standard = _BIDWAY_STANDARD_RE.search(output_str)
    if standard:
        output_str = standard.group()
        span[0] += standard.start()
        span[1] = span[0] + len(output_str)

    return output_str, span
def calculateLen(ss, i):
    """Return the total string length before and after position ``i``.

    Args:
        ss: Sequence of strings (e.g. the pieces produced by a split).
        i: Index of the piece of interest.

    Returns:
        A tuple ``(front_len, back_len)``: the summed lengths of the pieces
        at indices ``0 .. i-1`` and at indices ``i+1 .. len(ss)-1``.  The
        piece at ``i`` itself is not counted.
    """
    leading_total = sum(len(ss[pos]) for pos in range(i))
    trailing_total = sum(len(ss[pos]) for pos in range(i + 1, len(ss)))
    return leading_total, trailing_total
def extract_bidway(text):
    """Wrap the result of ``re_bidway`` as a list of entity-style dicts.

    Args:
        text: The text passed through to ``re_bidway``.

    Returns:
        An empty list when ``re_bidway`` yields no label; otherwise a
        one-element list holding a dict with the keys ``"body"`` (the label),
        ``"begin_index"`` and ``"end_index"`` (the two reported offsets).
    """
    label, offsets = re_bidway(text)
    if label is None:
        return []
    return [{"body": label, "begin_index": offsets[0], "end_index": offsets[1]}]
if __name__ == "__main__":
    # Smoke run on one sample sentence; the extraction result is not used.
    sample_text = (
        '政府采购项目招标方式:公开招标,联系人:黎明。'
        '代理机构地址:广州市天河区'
    )
    extract_bidway(sample_text)
|