123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325 |
- #coding:UTF-8
- import re
- import pandas as pd
- from bs4 import BeautifulSoup
# Debug switch. When True, re_serviceTime strips all whitespace from its input
# and the extraction functions print their intermediate results.
TEST_MODE = False

# Earlier, more specific version of `before`, kept for reference.
# before = '(?P<before>' \
# '合同期限|工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
# '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
# '|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
# '|交货时间|工期\(日历天\)' \
# '|服务期限为|计划工期|工期要求|服务期限|服务期' \
# '|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期|服务要求' \
# '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期|供货期|合同履行日期|计划周期|工期' \
# ')'

# NOTE: every pattern fragment below is a raw string so that regex escapes such
# as \( and \d are not treated as (invalid) Python string escapes. The runtime
# value of each fragment is unchanged.

# Keyword that introduces a period (e.g. 工期, 服务期限, 交货期). Alternatives are
# tried left to right, so their order is significant.
before = (
    r'(?P<before>'
    r'合同期限|工期/交货期/服务期|工期,|工期\(交货期\)|合格工期|服务期限|工期'
    r'|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期'
    r'|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间'
    r'|交货时间|工期|质保期'
    r'|服务期限为|计划工期|工期要求|服务期限|服务期'
    r'|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期'
    r'|完成时间|服务期限|中标工期|项目周期|期限要求|周期|供货期|合同履行日期|计划周期'
    r'|履约期限|合同约定完成时限|合同完成日期'
    r')'
)
# ^(?!.*abc).*$ excludes strings containing "abc"
# Lead-in used by reg_wuye: the "履约期限、地点等简要信息" field label, either
# followed directly by a colon or by up to 25 characters before a duration/year.
before_wuye = (
    r'(?P<before>'
    r'(履约期限、地点等简要信息[::])|(履约期限、地点等简要信息.{0,25}(?= [\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+([年月日]|个月)|20[21]))'
    r')'
)
# Phrase between the keyword and the value (e.g. "自合同签订之日起"). The trailing
# empty alternative ("|" right before the closing parenthesis) makes the whole
# group optional; reg4 below strips that empty alternative again.
before2 = (
    r'(?P<before2>'
    r'自合同签订之日起至|合同签订之日起|自合同签订之日起|开工后|不超过|签订合同后|系统开发'
    r'|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起'
    r'|[自从]?合同签[订定]生效之日起|自合同签订后不超过|中选后|均为|合同签订日至|合同期'
    r'|.{0,1}合同签订.{0,3}|计划|从|合同签订生效之日起|本项目招标有效期'
    r'|[自从]?签[订定]合同(之日|后).{1,4}|[自从]?(采购)?合同签[订定](之日|后|).{1,5}|签订合同起'
    r'|项目的有效期限为|项目服务为|签订合同期为|合同签[订定]生效之日.{1,4}'
    r'|[自从]服务合同生效之日.{1,4}|[自从].{2,15}之日.{1,4}|(本次)?采购周期'
    r'|(项目招标)?履行期|[自从于]?合同生效之日.{1,3}|'
    r')'
)
# Optional parenthesised unit directly after the keyword, e.g. "(日历天)".
before3 = (
    r'(?P<before3>'
    r'([\((]日历天[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?'
    r')'
)
# Connective punctuation / filler characters between keyword and value.
charac = (
    r'(?P<charac>'
    r'[::,,【()】为是起暂定的有效期限]*'
    r')'
)
# The value itself: a date range, a single full date, or a bare number written
# with digits and/or Chinese numerals.
center = (
    r'(?P<center>'
    r'[自为约是起暂定的拟期从]{0,3}(\d{2,4}[-.年/]?\d{1,2}[-.月/]?\d{1,2}[日号]?[-~~起至—]+(\d{2,4}[-.年/]?)?\d{1,2}[-.月/]?\d{1,2}[日号]?|\d{2,4}年\d{1,2}月\d{1,2}[日号]|[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+)'
    r')'
)
# Bare number (digits and/or Chinese numerals); only referenced by the
# commented-out reg4 variant below.
number = (
    r'(?P<number>'
    r'[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+'
    r')'
)
# Unit that follows the value. The trailing empty alternative makes it optional.
after = (
    r'(?P<after>'
    r'周年|周|号|天|个月|个年|年|个日历天|日历天|日|\(日历天\)|\(天\)|周内|,日历天|工作日|个工作日|'
    r')'
)
# Date range that starts with a four-digit year.
after1 = (
    r'(?P<after1>'
    r'[自为约是起暂定的拟从]{0,3}\d{4}[-年/]?(\d{1,2}[-月/]?)?(\d{1,2}[日号]?)?[-~~起至—]+(\d{4}[-年/]?)?(\d{1,2}[-月/]?)?(\d{1,2}日?)?(-\d{1,2}[日号]?)?([】)]?)'
    r')'
)
# Plain run of digits.
after2 = (
    r'(?P<after2>'
    r'\d+'
    r')'
)
# Up to 25 arbitrary characters ending in "止", optionally followed by one
# numeral plus 年/月/日.
after3 = (
    r'(?P<after3>'
    r'(.{0,25}止([\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾][年月日])?)'
    r')'
)

# Compiled extraction patterns; re_serviceTime tries them in the order
# reg2, reg, reg1, reg3, reg4, reg_wuye.
reg = re.compile(before + before3 + charac + before2 + center + after)
reg1 = re.compile(before + before3 + charac + after3)
reg2 = re.compile(before + before3 + charac + before2 + after1)
reg3 = re.compile(before + before3 + charac + before2 + after2)
# before2[:-2] + before2[-1:] drops the final "|" (the empty alternative), so in
# reg4 the lead-in phrase is mandatory and no `before` keyword is required.
reg4 = re.compile(before2[:-2]+before2[-1:] + center + after)
# reg4 = re.compile(before2[:-2]+before2[-1:] + number + after)
# print(before2[:-2]+before2[-1:])
reg_wuye = re.compile(before_wuye + center + after)

# Phrases that contain a keyword but are not a period statement; re_serviceTime
# overwrites them with "#" before matching.
reg_not = re.compile(u'(工期延误|工期节点|工期管理|交付使用'
                     u'|工期、)'
                     u'|工期情况|划工期内|服务期内')
reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,'
                      u'|务期限:1、|签订日期|证金在合同签|服务期限截止'
                      u')')
# reg_not2 = re.compile(u'(截止|1\\.|1、)')
reg_not2 = re.compile(u'(截止)')

# Post-filters applied to each candidate: it must contain a numeral and a
# unit/separator character, and must not contain any reg_error word.
reg_right_digit = re.compile(r'[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+')
reg_right_unit = re.compile(u'[-.年月日号天~~至—]')
reg_error = re.compile(u'公告|发布|中')
def _mask_confusing_words(input_str):
    """Overwrite every reg_not / reg_not1 / reg_not2 match with '#' characters."""
    for pattern in (reg_not, reg_not1, reg_not2):
        # The iterator scans the string as it was when the loop started; each
        # replacement has the same length, so later spans remain valid.
        for match in pattern.finditer(input_str):
            begin, end = match.span()
            instead = "#" * (end - begin)
            if TEST_MODE:
                print("word, instead, word_index", match.group(), instead, (begin, end))
            input_str = input_str[:begin] + instead + input_str[end:]
    return input_str


def _is_plausible_period(output):
    """Return True when a matched candidate passes all post-filters."""
    # Must contain a numeral and a unit/separator character.
    if not reg_right_digit.search(output):
        return False
    if not reg_right_unit.search(output):
        return False
    # Must not contain an unwanted word.
    if reg_error.search(output):
        return False
    # Reject a lone year such as "2019年": no month/day/unit character, exactly
    # one "年", and the first number in the candidate is >= 2000.
    if not re.search("[月日天号]", output) and output.count("年") == 1:
        year_time = re.search(r"\d+", output)
        if year_time is not None and int(year_time.group()) >= 2000:
            if TEST_MODE:
                print("delete output", output)
            return False
    return True


def re_serviceTime(text):
    """
    Find service-period phrases in ``text``.

    Confusing phrases are masked first, then the patterns reg2, reg, reg1,
    reg3, reg4 and reg_wuye are tried in that order; the first one that yields
    any match supplies the candidates, which are then filtered.

    Debug output is printed only when TEST_MODE is true.

    :param text: text to scan
    :return: ``(index2word, all_text_index_list)``. ``all_text_index_list``
        holds ``[begin, end]`` offsets of the kept candidates and
        ``index2word`` the corresponding slices of ``text``; every slice
        except the last has one trailing space appended.
    """
    if TEST_MODE:
        # print(chardet.detect(text))
        # In test mode all whitespace is removed first, so the returned offsets
        # refer to the stripped text.
        text = re.sub(r"\s+", "", text)

    # Masking keeps the length unchanged, so offsets found in the masked string
    # can be used to slice the unmasked ``text`` below.
    input_str = _mask_confusing_words(text)
    if TEST_MODE:
        print("input_str", input_str)

    # Try the patterns in priority order; stop at the first one with a result.
    output_list, text_index_list = [], []
    for label, pattern in (("reg2", reg2), ("reg", reg), ("reg1", reg1),
                           ("reg3", reg3), ("reg4", reg4), ("reg_wuye", reg_wuye)):
        output_list, text_index_list = re_findAllResult(pattern, input_str)
        if TEST_MODE:
            print("output_str, text_index " + label, output_list, text_index_list)
        if output_list:
            break

    # Keep only the spans whose matched text passes the filters.
    all_text_index_list = [
        span
        for output, span in zip(output_list, text_index_list)
        if _is_plausible_period(output)
    ]

    last = len(all_text_index_list) - 1
    index2word = []
    for position, (begin, end) in enumerate(all_text_index_list):
        word = text[begin:end]
        if position != last:
            word = word + " "
        index2word.append(word)

    if TEST_MODE:
        print("index2word all_text_index_list", index2word, all_text_index_list)
    return index2word, all_text_index_list
def re_findAllResult(reg, input, unit="", index=0):
    """
    Collect the text and offsets of every match of ``reg`` in ``input``.

    :param reg: regular expression (pattern string or compiled pattern)
    :param input: sentence to search
    :param unit: not used by this function
    :param index: not used by this function
    :return: ``(output_list, text_index)``. ``output_list`` holds, per match,
        the concatenation of the non-empty named groups in a fixed order;
        ``text_index`` holds ``[start, end]`` pairs where ``start`` is the match
        start shifted past the leading groups and ``end`` is the match end.
    """
    # Named groups are concatenated in this fixed order.
    group_order = ("before", "before3", "charac", "before2", "center",
                   "number", "after", "after1", "after2", "after3")

    output_list = []
    # Offsets into the full text.
    text_index = []
    for match in re.finditer(reg, input):
        groups = match.groupdict()
        pieces = [groups.get(name) or "" for name in group_order]

        keyword = groups.get("before")
        if keyword is None:
            # No keyword group in this match: keep the whole match.
            front_len = 0
        else:
            front_len = len(keyword)
            # Skip the connective characters only when the parenthesised unit
            # group exists and is empty.
            if groups.get("before3") == "":
                front_len += len(groups.get("charac"))
            # NOTE(review): before2's length is added even when before3 is
            # non-empty and charac was not skipped - confirm this is intended.
            lead_in = groups.get("before2")
            if lead_in is not None:
                front_len += len(lead_in)

        text_index.append([match.start() + front_len, match.end()])
        output_list.append("".join(pieces))
    return output_list, text_index
def calculateLen(ss, i):
    """
    Sum the lengths of the items before and after position ``i`` in ``ss``.

    :param ss: sequence of sized items (e.g. strings)
    :param i: position of the item to leave out
    :return: ``(front_len, back_len)`` - total length of ``ss[0..i-1]`` and of
        ``ss[i+1..]``
    """
    # Index-based access is kept on purpose so that out-of-range or negative
    # ``i`` behaves exactly as with explicit loops.
    before_total = sum(len(ss[pos]) for pos in range(i))
    after_total = sum(len(ss[pos]) for pos in range(i + 1, len(ss)))
    return before_total, after_total
def extract_servicetime(text):
    """
    Extract service-period phrases from ``text`` as a list of dicts.

    :param text: text to scan
    :return: one dict per phrase returned by re_serviceTime whose length is at
        most 35 characters, with keys ``body`` (the phrase), ``begin_index``
        and ``end_index`` (its offsets)
    """
    words, spans = re_serviceTime(text)
    # print(word, text_index_list)
    list_servicetime = [
        {"body": word, "begin_index": span[0], "end_index": span[1]}
        for word, span in zip(words, spans)
        if len(word) <= 35
    ]

    if TEST_MODE:
        print("list_servicetime", list_servicetime)
    return list_servicetime
def test_from_str():
    """Run the extractor on a hard-coded sample string and print the result."""
    # Alternative sample (a contract announcement), kept for manual testing:
    # sample = """
    # 青岛市即墨区新兴中学物业管理服务项目 信息公开 合同公告 一、合同编号:D202101110008 二、合同名称:物业管理服务项目 三、项目编码(或招标编号、政府采购计划编号、采购计划备案文号等,如有):D202101110008 四、项目名称:物业管理服务项目 五、合同主体 采购人(甲方):青岛市即墨区新兴中学 地址:新兴路288号 联系方式:0532-88509712 供应商(乙方):青岛安之信物业管理有限公司 地址:山东省青岛市即墨区振华街87号恒生源大厦5楼 联系方式:0532-88510757 15966863456 六、合同主要信息 主要标的名称:物业管理服务 规格型号(或服务要求):1、所有上岗人员均要求符合相应年龄、性别及学历标准,身体健康,品德端正,无任何违法犯罪记录。 2、门卫需24小时在岗等。 3、卫生保洁人员:负责学校公共区域卫生保洁等。 4、校园绿化人员:根据季节要求,规范养护校园植物等。 5、公共设施设备维修维护:维修 主要标的数量:1.0 主要标的单价:277200.0 合同金额:27.72 万元 履约期限、地点等简要信息:2021-01-31、即墨区新兴中学 采购方式:网上超市 七、合同签订日期:2021-01-11 八、合同公告日期:2021-01-11 九、其他补充事宜: 附件: 『查看附件』 发 布 人:青岛市即墨区新兴中学 发布时间: 2021年1月11日
    # """
    sample = " 服务周期/到货期:2022年6月1日-2022年12月31日。 "
    extracted = extract_servicetime(sample)
    print(extracted)
def test_from_csv():
    """
    Run the extractor over the ``text`` column of a CSV file.

    Reads the input CSV, stores the stringified extraction result of each row
    in a ``new_word`` column and writes the frame to a second CSV file.
    """
    frame = pd.read_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text.csv")
    extracted = [
        str(extract_servicetime(record["text"]))
        for _, record in frame.iterrows()
    ]
    frame["new_word"] = pd.DataFrame(extracted)
    frame.to_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text_new.csv")
def test_from_xlsx():
    """
    Run the extractor over the ``dochtmlcon`` column of an Excel file.

    Each cell is parsed as HTML, reduced to its text, and passed to the
    extractor; the stringified results go into a ``new_word`` column and the
    frame is written to a second Excel file without the index.
    """
    frame = pd.read_excel("D:/BIDI_DOC/比地_文档/service_time_error.xlsx")
    extracted = []
    for _, record in frame.iterrows():
        # Strip the markup and keep only the document text.
        plain_text = BeautifulSoup(record["dochtmlcon"], "lxml").get_text(strip=True)
        extracted.append(str(extract_servicetime(plain_text)))
    frame["new_word"] = pd.DataFrame(extracted)
    frame.to_excel("D:/BIDI_DOC/比地_文档/service_time_error_new.xlsx", index=False)
# Script entry point: when the module is executed directly, run the
# hard-coded string test.
if __name__ == '__main__':
    test_from_str()
|