|
@@ -1,143 +1,370 @@
|
|
|
#coding:UTF-8
|
|
|
import re
|
|
|
import pandas as pd
|
|
|
+import numpy as np
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+# from sqlalchemy import create_engine
|
|
|
+
|
|
|
+TEST_MODE = False
|
|
|
+
|
|
|
+
|
|
|
+# before = '(?P<before>' \
|
|
|
+# '合同期限|工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
|
|
|
+# '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
|
|
|
+# '|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
|
|
|
+# '|交货时间|工期\(日历天\)' \
|
|
|
+# '|服务期限为|计划工期|工期要求|服务期限|服务期' \
|
|
|
+# '|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期|服务要求' \
|
|
|
+# '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期|供货期|合同履行日期|计划周期|工期' \
|
|
|
+# ')'
|
|
|
+
|
|
|
+before = '(?P<before>' \
|
|
|
+ '合同期限|工期/交货期/服务期|工期,|工期\(交货期\)|合格工期|服务期限|工期' \
|
|
|
+ '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
|
|
|
+ '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
|
|
|
+ '|交货时间|工期|质保期' \
|
|
|
+ '|服务期限为|计划工期|工期要求|服务期限|服务期' \
|
|
|
+ '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
|
|
|
+ '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|供货期|合同履行日期|计划周期' \
|
|
|
+ '|履约期限|合同约定完成时限|合同完成日期|承诺完成日期' \
|
|
|
+ '|合同起始日起|合同履约期|履约截止日期|承包期限|合同完成日期|特许经营期限' \
|
|
|
+ '|服务期间|服务履行期|委托(管理)?期限|经营期限' \
|
|
|
+ ')'
|
|
|
+
|
|
|
+
|
|
|
+# ^(?!.*abc).*$ 排除abc字符串
|
|
|
+before_wuye = '(?P<before>' \
|
|
|
+ '(履约期限、地点等简要信息[::]((履约|时间|期限){1,2}[::])?)' \
|
|
|
+ ')'
|
|
|
+# '|(履约期限、地点等简要信息[^\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,25})' \
|
|
|
+# (履约期限、地点等简要信息.{0,25}(?= [\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+([年月日]|个月)|20[21]))
|
|
|
+
|
|
|
+before2 = '(?P<before2>' \
|
|
|
+ '自合同签订之日起至|合同签订之日起|自合同签订之日起|签订合同后|系统开发' \
|
|
|
+ '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
|
|
|
+ '|[自从]?合同签[订定]生效之日起|自合同签订后不超过|合同签订日至' \
|
|
|
+ '|合同签订生效之日起|本项目招标有效期' \
|
|
|
+ '|[自从于]?签[订定署字](合同|协议书|协议)并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,4}' \
|
|
|
+ '|[自从于]?(采购)?(合同|协议书|协议)(正式)?签[订定署字](完[成毕])?并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}' \
|
|
|
+ '|签订合同起' \
|
|
|
+ '|项目的有效期限为|项目服务为|签订合同期为' \
|
|
|
+ '|(合同|协议书)签[订定署字]生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
|
|
|
+ '|[自从于]服务(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
|
|
|
+ '|(本次)?采购周期' \
|
|
|
+ '|(项目招标)?履行期|[自从于]?(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,3}' \
|
|
|
+ '|服务(有效期|年限)|本?合同有效期|(正式)?入驻(之[日后]|后|起|算)+' \
|
|
|
+ '|(合同|协议书|协议)生效(之[日后]|后|起|算)+' \
|
|
|
+ '|自?(提供服务|采购人指定|合同约定)(之[日后]|后|起|算)+' \
|
|
|
+ '|本?项目合同期(为|是)*' \
|
|
|
+ '|交付使用(之[日后]|后|起|算)+|' \
|
|
|
+ ')'
|
|
|
+ # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
|
|
|
|
|
|
-def re_serviceTime(text):
|
|
|
+before3 = '(?P<before3>' \
|
|
|
+ '([\((]日历天[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
|
|
|
+ ')'
|
|
|
|
|
|
- text_list = []
|
|
|
- text_list.append(text)
|
|
|
- # 初始化
|
|
|
- output_list = []
|
|
|
- text_index_list = []
|
|
|
-
|
|
|
- before = '(?P<before>' \
|
|
|
- '工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
|
|
|
- '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
|
|
|
- '|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
|
|
|
- '|交货时间|工期\(日历天\)' \
|
|
|
- '|服务期限为|计划工期|工期要求|服务期限|服务期' \
|
|
|
- '|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期|服务要求' \
|
|
|
- '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期|供货期|合同履行日期|计划周期|工期' \
|
|
|
- ')'
|
|
|
-
|
|
|
- before2 = '(?P<before2>' \
|
|
|
- '自合同签订之日起至|合同签订之日起|约|自合同签订之日起|开工后|不超过|签订合同后|系统开发' \
|
|
|
- '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
|
|
|
- '|自合同签订生效之日起|自合同签订后不超过|中选后|均为|合同签订日至|本项目合同期|' \
|
|
|
- ')'
|
|
|
+before4 = '(?P<before4>' \
|
|
|
+ '(履约|[本项目原则上]*一招|期限|(服务|合同)(期|)|合计|均为|开工后|不超过|中选后|计划|达到|本合同|)' \
|
|
|
+ ')'
|
|
|
|
|
|
- charac = '(?P<charac>' \
|
|
|
- '[::,,]*' \
|
|
|
- ')'
|
|
|
+charac = '(?P<charac>' \
|
|
|
+ '[::,,【()】]*' \
|
|
|
+ ')'
|
|
|
|
|
|
- center = '(?P<center>' \
|
|
|
- '[自]?\d+年\d+月\d+日至\d+年\d+月\d+日|\d+年\d+月\d+日|[\d一二三四五六七两叁贰壹肆伍]+' \
|
|
|
- ')'
|
|
|
+# charac前后、center前、after1前 需加
|
|
|
+before5 = '(?P<before5>' \
|
|
|
+ '[自为约是起暂定的拟有效期限从共计至算是要求总服务到本项目]{0,5}' \
|
|
|
+ ')'
|
|
|
+before6 = '(?P<before6>' \
|
|
|
+ '[自为约是起暂定的拟有效期限从共计至算是要求总服务到本项目]{0,5}' \
|
|
|
+ ')'
|
|
|
+before7 = '(?P<before7>' \
|
|
|
+ '[自为约是起暂定的拟有效期限从共计至算是要求总服务到本项目]{0,5}' \
|
|
|
+ ')'
|
|
|
|
|
|
- center1 = '(?P<center1>' \
|
|
|
- '[自]?\d+年\d+月\d+日至\d+年\d+月\d+日|\d+年\d+月\d+日|[\d一二三四五六七两叁贰壹肆伍]+' \
|
|
|
- ')'
|
|
|
+center = '(?P<center>' \
|
|
|
+ '(\d{2,4}[-.年/](\d{1,2}[-.月/])?(\d{0,2}[日号]?)?[-~~开始起至到—-]+(\d{2,4}[-.年/]\d{1,2}[-.月/]\d{0,2}[日号]?|\d{2,4}[-.年/]\d{1,2}[-.月/]?|\d{1,2}[-.月/]\d{1,2}[日号]?|\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)' \
|
|
|
+ '|\d{2,4}[-.年/]\d{1,2}[-.月/](\d{1,2}[日号]?)?' \
|
|
|
+ '|[+\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+)(\)|)' \
|
|
|
+ ')'
|
|
|
+
|
|
|
+center2 = '(?P<center2>' \
|
|
|
+ '[.\d]+个?[月年]' \
|
|
|
+ ')'
|
|
|
+
|
|
|
+number = '(?P<number>' \
|
|
|
+ '[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+' \
|
|
|
+ ')'
|
|
|
+
|
|
|
+after = '(?P<after>' \
|
|
|
+ '[个,,(\(]*(日历|工作|学|)([年月日天周]|周年|整年)(内|)|\)|)|' \
|
|
|
+ ')'
|
|
|
+ # '|周|号|天|个月|个年|((|\(|)年()|\)|)|((|\(|)月()|\)|)|((|\(|)日()|\)|)' \
|
|
|
+ # '|个日历天|日历天|\(日历天\)|\(天\)|周内|,日历天|工作日|个工作日|' \
|
|
|
+
|
|
|
+after1 = '(?P<after1>' \
|
|
|
+ '\d{2,4}[-.年/](\d{1,2}[-.月/])?(\d{1,2}[日号])?[-~~开始起至到—]+(\d{2,4}[-.年/]\d{1,2}[-.月/]\d{0,2}[日号]?|\d{2,4}[-.年/]\d{1,2}[-.月/]?|\d{1,2}[-.月/]\d{1,2}[日号]?|\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)([】)]?)' \
|
|
|
+ ')'
|
|
|
+
|
|
|
+after2 = '(?P<after2>' \
|
|
|
+ '\d+' \
|
|
|
+ ')'
|
|
|
+
|
|
|
+after3 = '(?P<after3>' \
|
|
|
+ '(([\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾][年月日])?)' \
|
|
|
+ ')'
|
|
|
|
|
|
- after = '(?P<after>' \
|
|
|
- '天|个月|年|个日历天|日历天|日|\(日历天\)|\(天\)|周内|,日历天|' \
|
|
|
- ')'
|
|
|
+after4 = '(?P<after4>' \
|
|
|
+ '[^\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,25}止' \
|
|
|
+ ')'
|
|
|
|
|
|
- new1 = '(' \
|
|
|
- '\d{4}年?(\d{1,2}月?)?(\d{1,2}日?)?-(\d{4}年?)?(\d{1,2}月?)?(\d{1,2}日?)?(-\d{1,2}日?)?' \
|
|
|
- ')'
|
|
|
|
|
|
- reg = re.compile(before + charac + before2 + center + after)
|
|
|
+reg = re.compile(before + before3 + before7 + charac + before5 + before2 + before4 + before6 + center + after)
|
|
|
|
|
|
- reg1 = re.compile(before + charac + '(.*?止)')
|
|
|
+reg1 = re.compile(before + before3 + before7 + charac + before5 + after4 + after3)
|
|
|
|
|
|
- reg2 = re.compile(before + charac + before2 + new1)
|
|
|
+reg2 = re.compile(before + before3 + before7 + charac + before5 + before2 + before6 + after1)
|
|
|
|
|
|
- reg_not = re.compile(u'(工期延误|工期节点|工期管理|交付使用'
|
|
|
- u'|工期、)'
|
|
|
- u'|工期情况|划工期内|服务期内')
|
|
|
+reg3 = re.compile(before + before3 + before7 + charac + before5 + before2 + after2)
|
|
|
|
|
|
- reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,'
|
|
|
- u'|务期限:1、|同签订日期:|证金在合同签|服务期限截止'
|
|
|
- u')')
|
|
|
+reg4 = re.compile(before2[:-2]+before2[-1:] + before5 + center + after)
|
|
|
|
|
|
- reg_not2 = re.compile(u'截止|1\.|1、')
|
|
|
+reg5 = re.compile(before + before3 + before7 + charac + before5 + before2 + before4 + before6 + center2 + after)
|
|
|
+
|
|
|
+# reg4 = re.compile(before2[:-2]+before2[-1:] + number + after)
|
|
|
+# print(before2[:-2]+before2[-1:])
|
|
|
+
|
|
|
+reg_wuye = re.compile(before_wuye + before4 + before5 + center + after)
|
|
|
+
|
|
|
+reg_not = re.compile(u'(工期延误|工期节点|工期管理'
|
|
|
+ u'|工期、|终止)'
|
|
|
+ u'|工期情况|划工期内|服务期内'
|
|
|
+ u'|(\d{1,2}:\d{1,2}(:\d{1,2})?)')
|
|
|
+
|
|
|
+reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,'
|
|
|
+ u'|务期限:1、|签订日期|证金在合同签|服务期限截止'
|
|
|
+ u')')
|
|
|
+
|
|
|
+# reg_not2 = re.compile(u'(截止|1\\.|1、)')
|
|
|
+# reg_not2 = re.compile(u'')
|
|
|
+
|
|
|
+reg_right_digit = re.compile(u'[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+')
|
|
|
+
|
|
|
+reg_right_unit = re.compile(u'[-.年月日号天~~至到—/]')
|
|
|
+
|
|
|
+reg_error = re.compile(u'公告|发布|中')
|
|
|
+
|
|
|
+
|
|
|
+def re_serviceTime(text):
|
|
|
+ if TEST_MODE:
|
|
|
+ # print(chardet.detect(text))
|
|
|
+ text = re.sub("\s*", "", text)
|
|
|
+
|
|
|
+ text_list = []
|
|
|
+ text_list.append(text)
|
|
|
+ # 初始化
|
|
|
+ all_output_list = []
|
|
|
+ all_text_index_list = []
|
|
|
|
|
|
for index in range(len(text_list)):
|
|
|
# 初始化
|
|
|
- output_str = ""
|
|
|
+ output_list = []
|
|
|
input_str = text_list[index]
|
|
|
|
|
|
# 替换混淆词
|
|
|
- # input_str = re.sub(reg_not, "####", input_str)
|
|
|
- # input_str = re.sub(reg_not1, "######", input_str)
|
|
|
- # input_str = re.sub(reg_not2, "##", input_str)
|
|
|
- for _reg_not in [reg_not,reg_not1,reg_not2]:
|
|
|
- match = re.findall(_reg_not, input_str)
|
|
|
- if match:
|
|
|
- for word in match:
|
|
|
- instead = "#" * len(word)
|
|
|
- input_str = re.sub(word, instead, input_str)
|
|
|
-
|
|
|
- output_str, text_index = re_findAllResult(reg2, input_str)
|
|
|
- if len(text_index) == 0:
|
|
|
- output_str, text_index = re_findAllResult(reg, input_str)
|
|
|
- if len(text_index) == 0:
|
|
|
- output_str, text_index = re_findAllResult(reg1, input_str)
|
|
|
+ for _reg_not in [reg_not, reg_not1]:
|
|
|
+ match_iter = re.finditer(_reg_not, input_str)
|
|
|
+ for match in match_iter:
|
|
|
+ word_index = match.span()
|
|
|
+ word = match.group()
|
|
|
+ instead = "#" * len(word)
|
|
|
+ print("word, instead, word_index", word, instead, word_index)
|
|
|
+ input_str = input_str[:word_index[0]] + instead + input_str[word_index[1]:]
|
|
|
+
|
|
|
+ if TEST_MODE:
|
|
|
+ print("input_str", input_str)
|
|
|
+
|
|
|
+ # 匹配
|
|
|
+ output_list, text_index_list = re_findAllResult(reg_wuye, input_str)
|
|
|
+ if TEST_MODE:
|
|
|
+ print("output_str, text_index reg_wuye", output_list, text_index_list)
|
|
|
+ output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+
|
|
|
+ if len(output_list) == 0:
|
|
|
+ output_list, text_index_list = re_findAllResult(reg2, input_str)
|
|
|
+ if TEST_MODE:
|
|
|
+ print("output_str, text_index reg2", output_list, text_index_list)
|
|
|
+ output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+
|
|
|
+ if len(output_list) == 0:
|
|
|
+ output_list, text_index_list = re_findAllResult(reg, input_str)
|
|
|
+ if TEST_MODE:
|
|
|
+ print("output_str, text_index reg", output_list, text_index_list)
|
|
|
+ output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+
|
|
|
+ if len(output_list) == 0:
|
|
|
+ output_list, text_index_list = re_findAllResult(reg1, input_str)
|
|
|
+ if TEST_MODE:
|
|
|
+ print("output_str, text_index reg1", output_list, text_index_list)
|
|
|
+ output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+
|
|
|
+ if len(output_list) == 0:
|
|
|
+ output_list, text_index_list = re_findAllResult(reg3, input_str)
|
|
|
+ if TEST_MODE:
|
|
|
+ print("output_str, text_index reg3", output_list, text_index_list)
|
|
|
+ output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+
|
|
|
+ if len(output_list) == 0:
|
|
|
+ output_list, text_index_list = re_findAllResult(reg4, input_str)
|
|
|
+ if TEST_MODE:
|
|
|
+ print("output_str, text_index reg4", output_list, text_index_list)
|
|
|
+ output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+
|
|
|
+ if len(output_list) == 0:
|
|
|
+ output_list, text_index_list = re_findAllResult(reg5, input_str)
|
|
|
+ if TEST_MODE:
|
|
|
+ print("output_str, text_index reg5", output_list, text_index_list)
|
|
|
+ output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
|
|
|
# 添加
|
|
|
- output_list.append(output_str)
|
|
|
- text_index_list.append(text_index)
|
|
|
+ all_output_list += output_list
|
|
|
+ all_text_index_list += text_index_list
|
|
|
|
|
|
index2word = []
|
|
|
- for index in range(len(text_index_list)):
|
|
|
- word = ""
|
|
|
- for i in range(len(text_index_list[index])):
|
|
|
- word = word + text[text_index_list[index][i][0]:text_index_list[index][i][1]]
|
|
|
- if word != len(text_index_list[index])-1:
|
|
|
- word = word + " "
|
|
|
+ for i in range(len(all_text_index_list)):
|
|
|
+ word = text[all_text_index_list[i][0]:all_text_index_list[i][1]]
|
|
|
+ if i != len(all_text_index_list)-1:
|
|
|
+ word = word + " "
|
|
|
index2word.append(word)
|
|
|
|
|
|
- return index2word[0], text_index_list[0]
|
|
|
-
|
|
|
+ if TEST_MODE:
|
|
|
+ print("index2word all_text_index_list", index2word, all_text_index_list)
|
|
|
+ return index2word, all_text_index_list
|
|
|
+
|
|
|
+
|
|
|
+def filter_service_time(output_list, text_index_list):
|
|
|
+ # 过滤
|
|
|
+ delete_list = []
|
|
|
+ for i in range(len(output_list)):
|
|
|
+ output = output_list[i]
|
|
|
+
|
|
|
+ # 日期影响
|
|
|
+ if re.findall("日", output) and not re.findall(reg_right_unit, re.sub("日期", "", output)):
|
|
|
+ delete_list.append([output, text_index_list[i]])
|
|
|
+ print("delete output", output)
|
|
|
+ continue
|
|
|
+ # 不包含数字、单位的
|
|
|
+ if not re.findall(reg_right_digit, output):
|
|
|
+ delete_list.append([output, text_index_list[i]])
|
|
|
+ continue
|
|
|
+ if not re.findall(reg_right_unit, output):
|
|
|
+ delete_list.append([output, text_index_list[i]])
|
|
|
+ continue
|
|
|
+ # 包含不要的字
|
|
|
+ if re.findall(reg_error, output):
|
|
|
+ delete_list.append([output, text_index_list[i]])
|
|
|
+ continue
|
|
|
+ # 类似2019年的
|
|
|
+ if not re.findall("[-./月日天号]", output):
|
|
|
+ if len(re.findall("年", output)) == 1:
|
|
|
+ year_time = re.search("\d+", output)
|
|
|
+ if year_time is not None and int(year_time.group()) >= 2000:
|
|
|
+ delete_list.append([output, text_index_list[i]])
|
|
|
+ for output, text_index in delete_list:
|
|
|
+ if output in output_list:
|
|
|
+ output_list.remove(output)
|
|
|
+ if text_index in text_index_list:
|
|
|
+ text_index_list.remove(text_index)
|
|
|
+
|
|
|
+ if TEST_MODE:
|
|
|
+ print("delete_list", delete_list)
|
|
|
+ return output_list, text_index_list
|
|
|
|
|
|
|
|
|
def re_findAllResult(reg, input, unit="", index=0):
|
|
|
- '''
|
|
|
+ """
|
|
|
|
|
|
:param reg: 正则表达式
|
|
|
:param input: 待匹配句子
|
|
|
:param unit: 需要加的单位
|
|
|
:param index: 字符串拼接的开始位置
|
|
|
:return: 正则后的字符串
|
|
|
- '''
|
|
|
- match = re.findall(reg, input)
|
|
|
- output = ""
|
|
|
- # print(match)
|
|
|
- if match:
|
|
|
- ss = ""
|
|
|
- for i in range(len(match)):
|
|
|
- s = ""
|
|
|
- for j in range(index, len(match[i])):
|
|
|
- s = s + match[i][j]
|
|
|
- if unit != "" and j == len(match[i])-1:
|
|
|
- s = s + unit
|
|
|
- ss = ss + s
|
|
|
- if i < len(match)-1:
|
|
|
- ss = ss + " "
|
|
|
- output = ss
|
|
|
-
|
|
|
+ """
|
|
|
# 全文下标
|
|
|
text_index = []
|
|
|
match1 = re.finditer(reg, input)
|
|
|
+ output_list = []
|
|
|
for i in match1:
|
|
|
+ output = ""
|
|
|
d = i.groupdict()
|
|
|
- if d.get("before") is not None:
|
|
|
- front_len = len(d.get("before")) + len(d.get("charac"))
|
|
|
- else:
|
|
|
- front_len = 0
|
|
|
- text_index.append([i.start()+front_len, i.end()])
|
|
|
+ if d.get("before"):
|
|
|
+ output += d.get("before")
|
|
|
+ if d.get("before3"):
|
|
|
+ output += d.get("before3")
|
|
|
+ if d.get("before7"):
|
|
|
+ output += d.get("before7")
|
|
|
+ if d.get("charac"):
|
|
|
+ output += d.get("charac")
|
|
|
+ if d.get("before2"):
|
|
|
+ output += d.get("before2")
|
|
|
+ if d.get("before4"):
|
|
|
+ output += d.get("before4")
|
|
|
+ if d.get("before5"):
|
|
|
+ output += d.get("before5")
|
|
|
+ if d.get("before6"):
|
|
|
+ output += d.get("before6")
|
|
|
+ if d.get("center"):
|
|
|
+ output += d.get("center")
|
|
|
+ if d.get("number"):
|
|
|
+ output += d.get("number")
|
|
|
+ if d.get("after"):
|
|
|
+ output += d.get("after")
|
|
|
+ if d.get("after1"):
|
|
|
+ output += d.get("after1")
|
|
|
+ if d.get("after2"):
|
|
|
+ output += d.get("after2")
|
|
|
+ if d.get("after4"):
|
|
|
+ output += d.get("after4")
|
|
|
+ if d.get("after3"):
|
|
|
+ output += d.get("after3")
|
|
|
+
|
|
|
+ if TEST_MODE:
|
|
|
+ for key in d.keys():
|
|
|
+ if d.get(key):
|
|
|
+ print('d.get("' + key + '")', d.get(key))
|
|
|
+
|
|
|
+ # if d.get("before") is not None:
|
|
|
+ # if d.get("before3") is None or d.get("before3") != "":
|
|
|
+ # front_len = len(d.get("before"))
|
|
|
+ # # print("1-", len(d.get("before")))
|
|
|
+ # else:
|
|
|
+ # front_len = len(d.get("before")) + len(d.get("charac"))
|
|
|
+ # # print("2-", len(d.get("before")), len(d.get("charac")))
|
|
|
+ # if d.get("before2") is not None:
|
|
|
+ # front_len += len(d.get("before2"))
|
|
|
+ # if d.get("before4") is not None:
|
|
|
+ # front_len += len(d.get("before4"))
|
|
|
+ # else:
|
|
|
+ # if d.get("before2") is not None:
|
|
|
+ # front_len = len(d.get("before2"))
|
|
|
+ # else:
|
|
|
+ # front_len = 0
|
|
|
+
|
|
|
+ front_len = 0
|
|
|
+ for key in d.keys():
|
|
|
+ if d.get(key) and key in ["before", "before2", "before4",
|
|
|
+ "before5", "before6", "before7", "charac",
|
|
|
+ "after4"]:
|
|
|
+ front_len += len(d.get(key))
|
|
|
+ # 特殊情况
|
|
|
+ if d.get("before3"):
|
|
|
+ front_len -= len(d.get("before7"))
|
|
|
+ front_len -= len(d.get("charac"))
|
|
|
|
|
|
- return output, text_index
|
|
|
+ text_index.append([i.start()+front_len, i.end()])
|
|
|
+ output_list.append(input[i.start()+front_len: i.end()])
|
|
|
+ return output_list, text_index
|
|
|
|
|
|
|
|
|
def calculateLen(ss, i):
|
|
@@ -152,54 +379,70 @@ def calculateLen(ss, i):
|
|
|
|
|
|
def extract_servicetime(text):
|
|
|
list_servicetime = []
|
|
|
- word, text_index_list = re_serviceTime(text)
|
|
|
+ word_list, text_index_list = re_serviceTime(text)
|
|
|
# print(word, text_index_list)
|
|
|
for i in range(len(text_index_list)):
|
|
|
- word_list = word.split(" ")
|
|
|
d = {"body": word_list[i], "begin_index": text_index_list[i][0], "end_index": text_index_list[i][1]}
|
|
|
if len(word_list[i]) <= 35:
|
|
|
list_servicetime.append(d)
|
|
|
- # print(list_servicetime)
|
|
|
-
|
|
|
+ if TEST_MODE:
|
|
|
+ print("list_servicetime", list_servicetime)
|
|
|
return list_servicetime
|
|
|
|
|
|
|
|
|
-if __name__ == '__main__':
|
|
|
- s = """
|
|
|
- 4、其他:无
|
|
|
-
|
|
|
-合同履行期限:双方签订合同后30天内
|
|
|
-
|
|
|
-本项目( 不接受 )联合体投标。
|
|
|
-
|
|
|
-二、申请人的资格要求:
|
|
|
-
|
|
|
-1.满足《中华人民共和国政府采购法》第二十二条规定;
|
|
|
-
|
|
|
-2.落实政府采购政策需满足的资格要求:
|
|
|
-
|
|
|
-2.1具备《政府采购法》第二十二条规定的条件,且提供以下证明文件:
|
|
|
-
|
|
|
-(1)在中华人民共和国境内注册的法人或其他组织或自然人, 投标(响应)时提交有效的营业执照(或事业法人登记证或身份证等相关证明)副本复印件。
|
|
|
-
|
|
|
-(2)有依法缴纳税收和社会保障资金的良好记录:提供投标截止日前6个月内任意1个月依法缴纳税收和社会保障资金的相关材料。如依法免税或不需要缴纳社会保障资金的,提供相应证明材料。
|
|
|
-
|
|
|
-(3)供应商必须具有良好的商业信誉和健全的财务会计制度(提供2020年度财务状况报告或基本开户行出具的资信证明)。
|
|
|
-
|
|
|
-(4)履行合同所必需的设备和专业技术能力的证明材料或书面声明。
|
|
|
-
|
|
|
-(5)参加政府采购活动前 3 年内在经营活动中没有重大违法记录的书面声明。
|
|
|
-
|
|
|
-(6)信用记录:供应商未被列入“信用中国”网站(www.creditchina.gov.cn)“记录失信被执行人或重大税收违法案件当事人名单”记录名单; 不处于中国政府采购网(www.ccgp.gov.cn)“政府采购严重违法失信行为信息记录”中的禁止参加政府采购活动期间。
|
|
|
- """
|
|
|
- # s = "自合同签订之日起至2022-6-30 自合同签订之日起至2022-07-30"
|
|
|
+def test_from_str():
|
|
|
+ # s = """
|
|
|
+ # """
|
|
|
+ s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
|
|
|
print(extract_servicetime(s))
|
|
|
+ print(re.findall('(\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)+[-~~起至到—]+\d{2,4}[-.年/]', s))
|
|
|
+
|
|
|
+
|
|
|
+def test_from_csv():
|
|
|
+ df = pd.read_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text.csv")
|
|
|
+ result_list = []
|
|
|
+ for index, row in df.iterrows():
|
|
|
+ result = extract_servicetime(row["text"])
|
|
|
+ result_list.append(str(result))
|
|
|
+
|
|
|
+ df["new_word"] = pd.DataFrame(result_list)
|
|
|
+ df.to_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text_new.csv")
|
|
|
+
|
|
|
+
|
|
|
+def test_from_xlsx():
|
|
|
+ df = pd.read_excel("D:/BIDI_DOC/比地_文档/service_time_error.xlsx")
|
|
|
+ result_list = []
|
|
|
+ for index, row in df.iterrows():
|
|
|
+ text = row["dochtmlcon"]
|
|
|
+ soup = BeautifulSoup(text, "lxml")
|
|
|
+ text = soup.get_text(strip=True)
|
|
|
+ result = extract_servicetime(text)
|
|
|
+ result_list.append(str(result))
|
|
|
+
|
|
|
+ df["new_word"] = pd.DataFrame(result_list)
|
|
|
+ df.to_excel("D:/BIDI_DOC/比地_文档/service_time_error_new.xlsx", index=False)
|
|
|
+
|
|
|
+
|
|
|
+# def test_from_db():
|
|
|
+# engine = create_engine("mysql+pymysql://root:pwdformysql0922@192.168.2.170:3306/"
|
|
|
+# "exportdb?charset=utf8")
|
|
|
+# sql = 'SELECT docid, doctextcon, service_time_1 FROM `wuye_zhouqi_1` where service_time_1 <> "" and service_time_1 is not null;'
|
|
|
+# # 建立dataframe
|
|
|
+# df = pd.read_sql_query(sql, engine)
|
|
|
+# result_list = []
|
|
|
+# for index, row in df.iterrows():
|
|
|
+# result = extract_servicetime(row["doctextcon"])
|
|
|
+# if len(result) > 0:
|
|
|
+# temp = ""
|
|
|
+# for r in result:
|
|
|
+# temp += r.get("body") + "##"
|
|
|
+# result_list.append(temp)
|
|
|
+# else:
|
|
|
+# result_list.append(np.nan)
|
|
|
+#
|
|
|
+# df["new_service_time"] = pd.DataFrame(result_list)
|
|
|
+# df.to_excel("D:/BIDI_DOC/比地_文档/service_time_from_wuye_zhouqi.xlsx", index=False)
|
|
|
+
|
|
|
|
|
|
- # df = pd.read_csv("C:\\Users\\admin\\Desktop\\serviceTime_text.csv")
|
|
|
- # result_list = []
|
|
|
- # for index, row in df.iterrows():
|
|
|
- # result = extract_servicetime(row["text"])
|
|
|
- # result_list.append(str(result))
|
|
|
- #
|
|
|
- # df["new_word"] = pd.DataFrame(result_list)
|
|
|
- # df.to_csv("C:\\Users\\admin\\Desktop\\serviceTime_text_new.csv")
|
|
|
+if __name__ == '__main__':
|
|
|
+ test_from_str()
|