Ver Fonte

更新服务期限正则

fangjiasheng há 3 anos atrás
pai
commit
4d5e4b7279
1 ficheiros alterados com 381 adições e 138 exclusões
  1. 381 138
      BiddingKG/dl/time/re_servicetime.py

+ 381 - 138
BiddingKG/dl/time/re_servicetime.py

@@ -1,143 +1,370 @@
 #coding:UTF-8
 import re
 import pandas as pd
+import numpy as np
+from bs4 import BeautifulSoup
+# from sqlalchemy import create_engine
+
+TEST_MODE = False
+
+
+# before = '(?P<before>' \
+#          '合同期限|工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
+#          '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
+#          '|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
+#          '|交货时间|工期\(日历天\)' \
+#          '|服务期限为|计划工期|工期要求|服务期限|服务期' \
+#          '|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期|服务要求' \
+#          '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期|供货期|合同履行日期|计划周期|工期' \
+#          ')'
+
+before = '(?P<before>' \
+         '合同期限|工期/交货期/服务期|工期,|工期\(交货期\)|合格工期|服务期限|工期' \
+         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
+         '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
+         '|交货时间|工期|质保期' \
+         '|服务期限为|计划工期|工期要求|服务期限|服务期' \
+         '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
+         '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|供货期|合同履行日期|计划周期' \
+         '|履约期限|合同约定完成时限|合同完成日期|承诺完成日期' \
+         '|合同起始日起|合同履约期|履约截止日期|承包期限|合同完成日期|特许经营期限' \
+         '|服务期间|服务履行期|委托(管理)?期限|经营期限' \
+         ')'
+
+
+# ^(?!.*abc).*$ 排除abc字符串
+before_wuye = '(?P<before>' \
+              '(履约期限、地点等简要信息[::]((履约|时间|期限){1,2}[::])?)' \
+              ')'
+# '|(履约期限、地点等简要信息[^\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,25})' \
+# (履约期限、地点等简要信息.{0,25}(?= [\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+([年月日]|个月)|20[21]))
+
+before2 = '(?P<before2>' \
+          '自合同签订之日起至|合同签订之日起|自合同签订之日起|签订合同后|系统开发' \
+          '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
+          '|[自从]?合同签[订定]生效之日起|自合同签订后不超过|合同签订日至' \
+          '|合同签订生效之日起|本项目招标有效期' \
+          '|[自从于]?签[订定署字](合同|协议书|协议)并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,4}' \
+          '|[自从于]?(采购)?(合同|协议书|协议)(正式)?签[订定署字](完[成毕])?并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}' \
+          '|签订合同起' \
+          '|项目的有效期限为|项目服务为|签订合同期为' \
+          '|(合同|协议书)签[订定署字]生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+          '|[自从于]服务(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+          '|(本次)?采购周期' \
+          '|(项目招标)?履行期|[自从于]?(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,3}' \
+          '|服务(有效期|年限)|本?合同有效期|(正式)?入驻(之[日后]|后|起|算)+' \
+          '|(合同|协议书|协议)生效(之[日后]|后|起|算)+' \
+          '|自?(提供服务|采购人指定|合同约定)(之[日后]|后|起|算)+' \
+          '|本?项目合同期(为|是)*' \
+          '|交付使用(之[日后]|后|起|算)+|' \
+          ')'
+        # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
 
-def re_serviceTime(text):
+before3 = '(?P<before3>' \
+          '([\((]日历天[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
+          ')'
 
-    text_list = []
-    text_list.append(text)
-    # 初始化
-    output_list = []
-    text_index_list = []
-
-    before = '(?P<before>' \
-             '工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
-             '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
-             '|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
-             '|交货时间|工期\(日历天\)' \
-             '|服务期限为|计划工期|工期要求|服务期限|服务期' \
-             '|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期|服务要求' \
-             '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期|供货期|合同履行日期|计划周期|工期' \
-             ')'
-
-    before2 = '(?P<before2>' \
-              '自合同签订之日起至|合同签订之日起|约|自合同签订之日起|开工后|不超过|签订合同后|系统开发' \
-              '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
-              '|自合同签订生效之日起|自合同签订后不超过|中选后|均为|合同签订日至|本项目合同期|' \
-              ')'
+before4 = '(?P<before4>' \
+          '(履约|[本项目原则上]*一招|期限|(服务|合同)(期|)|合计|均为|开工后|不超过|中选后|计划|达到|本合同|)' \
+          ')'
 
-    charac = '(?P<charac>' \
-             '[::,,]*' \
-             ')'
+charac = '(?P<charac>' \
+         '[::,,【()】]*' \
+         ')'
 
-    center = '(?P<center>' \
-             '[自]?\d+年\d+月\d+日至\d+年\d+月\d+日|\d+年\d+月\d+日|[\d一二三四五六七两叁贰壹肆伍]+' \
-             ')'
+# charac前后、center前、after1前 需加
+before5 = '(?P<before5>' \
+          '[自为约是起暂定的拟有效期限从共计至算是要求总服务到本项目]{0,5}' \
+          ')'
+before6 = '(?P<before6>' \
+          '[自为约是起暂定的拟有效期限从共计至算是要求总服务到本项目]{0,5}' \
+          ')'
+before7 = '(?P<before7>' \
+          '[自为约是起暂定的拟有效期限从共计至算是要求总服务到本项目]{0,5}' \
+          ')'
 
-    center1 = '(?P<center1>' \
-              '[自]?\d+年\d+月\d+日至\d+年\d+月\d+日|\d+年\d+月\d+日|[\d一二三四五六七两叁贰壹肆伍]+' \
-              ')'
+center = '(?P<center>' \
+         '(\d{2,4}[-.年/](\d{1,2}[-.月/])?(\d{0,2}[日号]?)?[-~~开始起至到—-]+(\d{2,4}[-.年/]\d{1,2}[-.月/]\d{0,2}[日号]?|\d{2,4}[-.年/]\d{1,2}[-.月/]?|\d{1,2}[-.月/]\d{1,2}[日号]?|\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)' \
+         '|\d{2,4}[-.年/]\d{1,2}[-.月/](\d{1,2}[日号]?)?' \
+         '|[+\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+)(\)|)' \
+         ')'
+
+center2 = '(?P<center2>' \
+          '[.\d]+个?[月年]' \
+          ')'
+
+number = '(?P<number>' \
+         '[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+' \
+         ')'
+
+after = '(?P<after>' \
+        '[个,,(\(]*(日历|工作|学|)([年月日天周]|周年|整年)(内|)|\)|)|' \
+        ')'
+        # '|周|号|天|个月|个年|((|\(|)年()|\)|)|((|\(|)月()|\)|)|((|\(|)日()|\)|)' \
+        # '|个日历天|日历天|\(日历天\)|\(天\)|周内|,日历天|工作日|个工作日|' \
+
+after1 = '(?P<after1>' \
+         '\d{2,4}[-.年/](\d{1,2}[-.月/])?(\d{1,2}[日号])?[-~~开始起至到—]+(\d{2,4}[-.年/]\d{1,2}[-.月/]\d{0,2}[日号]?|\d{2,4}[-.年/]\d{1,2}[-.月/]?|\d{1,2}[-.月/]\d{1,2}[日号]?|\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)([】)]?)' \
+         ')'
+
+after2 = '(?P<after2>' \
+         '\d+' \
+         ')'
+
+after3 = '(?P<after3>' \
+         '(([\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾][年月日])?)' \
+         ')'
 
-    after = '(?P<after>' \
-            '天|个月|年|个日历天|日历天|日|\(日历天\)|\(天\)|周内|,日历天|' \
-            ')'
+after4 = '(?P<after4>' \
+         '[^\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,25}止' \
+         ')'
 
-    new1 = '(' \
-           '\d{4}年?(\d{1,2}月?)?(\d{1,2}日?)?-(\d{4}年?)?(\d{1,2}月?)?(\d{1,2}日?)?(-\d{1,2}日?)?' \
-           ')'
 
-    reg = re.compile(before + charac + before2 + center + after)
+reg = re.compile(before + before3 + before7 + charac + before5 + before2 + before4 + before6 + center + after)
 
-    reg1 = re.compile(before + charac + '(.*?止)')
+reg1 = re.compile(before + before3 + before7 + charac + before5 + after4 + after3)
 
-    reg2 = re.compile(before + charac + before2 + new1)
+reg2 = re.compile(before + before3 + before7 + charac + before5 + before2 + before6 + after1)
 
-    reg_not = re.compile(u'(工期延误|工期节点|工期管理|交付使用'
-                         u'|工期、)'
-                         u'|工期情况|划工期内|服务期内')
+reg3 = re.compile(before + before3 + before7 + charac + before5 + before2 + after2)
 
-    reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,'
-                          u'|务期限:1、|同签订日期:|证金在合同签|服务期限截止'
-                          u')')
+reg4 = re.compile(before2[:-2]+before2[-1:] + before5 + center + after)
 
-    reg_not2 = re.compile(u'截止|1\.|1、')
+reg5 = re.compile(before + before3 + before7 + charac + before5 + before2 + before4 + before6 + center2 + after)
+
+# reg4 = re.compile(before2[:-2]+before2[-1:] + number + after)
+# print(before2[:-2]+before2[-1:])
+
+reg_wuye = re.compile(before_wuye + before4 + before5 + center + after)
+
+reg_not = re.compile(u'(工期延误|工期节点|工期管理'
+                     u'|工期、|终止)'
+                     u'|工期情况|划工期内|服务期内'
+                     u'|(\d{1,2}:\d{1,2}(:\d{1,2})?)')
+
+reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,'
+                      u'|务期限:1、|签订日期|证金在合同签|服务期限截止'
+                      u')')
+
+# reg_not2 = re.compile(u'(截止|1\\.|1、)')
+# reg_not2 = re.compile(u'')
+
+# filter_service_time drops a candidate that has no match for this pattern:
+# an Arabic digit, a Chinese numeral, or "半".
+reg_right_digit = re.compile(u'[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+')
+
+# filter_service_time drops a candidate that has no match for this pattern:
+# a date/duration unit character or a range separator.
+reg_right_unit = re.compile(u'[-.年月日号天~~至到—/]')
+
+# filter_service_time drops a candidate that contains any of these words.
+reg_error = re.compile(u'公告|发布|中')
+
+
+def re_serviceTime(text):
+    """Extract service-period phrases from ``text``.
+
+    Phrases matched by ``reg_not`` / ``reg_not1`` are first masked with
+    ``#`` characters of the same length, so offsets found afterwards still
+    index into ``text``.  The patterns ``reg_wuye``, ``reg2``, ``reg``,
+    ``reg1``, ``reg3``, ``reg4`` and ``reg5`` are then tried in that order;
+    the first one whose matches survive ``filter_service_time`` wins.
+
+    :param text: text to search
+    :return: ``(index2word, all_text_index_list)`` - the matched substrings
+        of ``text`` (every one except the last carries a trailing space)
+        and their ``[begin, end]`` offsets
+    """
+    if TEST_MODE:
+        # Debug runs match against the text with all whitespace removed.
+        text = re.sub(r"\s+", "", text)
+
+    text_list = [text]
+    # Results accumulated over every entry of text_list
+    all_output_list = []
+    all_text_index_list = []
 
     for index in range(len(text_list)):
         # 初始化
-        output_str = ""
+        output_list = []
         input_str = text_list[index]
 
         # 替换混淆词
-        # input_str = re.sub(reg_not, "####", input_str)
-        # input_str = re.sub(reg_not1, "######", input_str)
-        # input_str = re.sub(reg_not2, "##", input_str)
-        for _reg_not in [reg_not,reg_not1,reg_not2]:
-            match = re.findall(_reg_not, input_str)
-            if match:
-                for word in match:
-                    instead = "#" * len(word)
-                    input_str = re.sub(word, instead, input_str)
-
-        output_str, text_index = re_findAllResult(reg2, input_str)
-        if len(text_index) == 0:
-            output_str, text_index = re_findAllResult(reg, input_str)
-            if len(text_index) == 0:
-                output_str, text_index = re_findAllResult(reg1, input_str)
+        # Mask confusable phrases with same-length runs of "#" so that the
+        # spans found while iterating stay valid after each replacement.
+        for _reg_not in [reg_not, reg_not1]:
+            for match in _reg_not.finditer(input_str):
+                start, end = match.span()
+                instead = "#" * (end - start)
+                if TEST_MODE:
+                    print("word, instead, word_index", match.group(), instead, (start, end))
+                input_str = input_str[:start] + instead + input_str[end:]
+
+        if TEST_MODE:
+            print("input_str", input_str)
+
+        # Try each pattern in a fixed priority order and keep the first one
+        # that still has candidates left after filtering.
+        reg_cascade = [("reg_wuye", reg_wuye), ("reg2", reg2), ("reg", reg),
+                       ("reg1", reg1), ("reg3", reg3), ("reg4", reg4),
+                       ("reg5", reg5)]
+        for reg_name, _reg in reg_cascade:
+            output_list, text_index_list = re_findAllResult(_reg, input_str)
+            if TEST_MODE:
+                print("output_str, text_index " + reg_name, output_list, text_index_list)
+            output_list, text_index_list = filter_service_time(output_list, text_index_list)
+            if len(output_list) != 0:
+                break
 
         # 添加
-        output_list.append(output_str)
-        text_index_list.append(text_index)
+        all_output_list += output_list
+        all_text_index_list += text_index_list
 
     index2word = []
-    for index in range(len(text_index_list)):
-        word = ""
-        for i in range(len(text_index_list[index])):
-            word = word + text[text_index_list[index][i][0]:text_index_list[index][i][1]]
-            if word != len(text_index_list[index])-1:
-                word = word + " "
+    last = len(all_text_index_list) - 1
+    for i, (begin, end) in enumerate(all_text_index_list):
+        word = text[begin:end]
+        # NOTE(review): every word except the last gets a trailing space.
+        # This looks like a leftover from when the words were joined into
+        # one string - confirm no caller relies on it before removing.
+        if i != last:
+            word = word + " "
         index2word.append(word)
 
-    return index2word[0], text_index_list[0]
-
+    if TEST_MODE:
+        print("index2word all_text_index_list", index2word, all_text_index_list)
+    return index2word, all_text_index_list
+
+
+def filter_service_time(output_list, text_index_list):
+    """Remove candidate matches that do not look like a real service period.
+
+    Both lists are filtered in place (and also returned), index for index.
+    A candidate is dropped when it
+
+    * contains "日" but has no ``reg_right_unit`` match once "日期" is removed,
+    * has no ``reg_right_digit`` match or no ``reg_right_unit`` match,
+    * contains a word matched by ``reg_error``,
+    * is a bare year: exactly one "年", none of ``-./月日天号``, and its
+      first number is >= 2000.
+
+    :param output_list: matched substrings
+    :param text_index_list: ``[begin, end]`` offsets, parallel to output_list
+    :return: the same two list objects after filtering
+    """
+    kept_outputs = []
+    kept_indexes = []
+    delete_list = []
+    for output, text_index in zip(output_list, text_index_list):
+        drop = False
+        if "日" in output and not reg_right_unit.search(output.replace("日期", "")):
+            # Nothing unit-like is left once the word "日期" is taken out.
+            if TEST_MODE:
+                print("delete output", output)
+            drop = True
+        elif not reg_right_digit.search(output) or not reg_right_unit.search(output):
+            # Needs both a number and a unit/separator.
+            drop = True
+        elif reg_error.search(output):
+            drop = True
+        elif not re.search("[-./月日天号]", output) and output.count("年") == 1:
+            # Something like "2019年" is a calendar year, not a duration.
+            year_time = re.search(r"\d+", output)
+            drop = year_time is not None and int(year_time.group()) >= 2000
+
+        if drop:
+            delete_list.append([output, text_index])
+        else:
+            kept_outputs.append(output)
+            kept_indexes.append(text_index)
+
+    # Slice-assign so callers holding the original list objects see the result.
+    output_list[:] = kept_outputs
+    text_index_list[:] = kept_indexes
+
+    if TEST_MODE:
+        print("delete_list", delete_list)
+    return output_list, text_index_list
 
 
 def re_findAllResult(reg, input, unit="", index=0):
-    '''
-
-    :param reg: 正则表达式
-    :param input: 待匹配句子
-    :param unit: 需要加的单位
-    :param index: 字符串拼接的开始位置
-    :return: 正则后的字符串
-    '''
-    match = re.findall(reg, input)
-    output = ""
-    # print(match)
-    if match:
-        ss = ""
-        for i in range(len(match)):
-            s = ""
-            for j in range(index, len(match[i])):
-                s = s + match[i][j]
-                if unit != "" and j == len(match[i])-1:
-                    s = s + unit
-            ss = ss + s
-            if i < len(match)-1:
-                ss = ss + " "
-        output = ss
-
+    """Locate every match of ``reg`` in ``input`` and trim its lead-in text.
+
+    For each match, the lengths of the non-empty named groups ``before``,
+    ``before2``, ``before4``, ``before5``, ``before6``, ``before7``,
+    ``charac`` and ``after4`` are summed, and the reported span starts that
+    many characters after the start of the match.
+
+    :param reg: pattern (compiled or string) using the named groups above
+    :param input: text to search
+    :param unit: unused, kept so existing callers keep working
+    :param index: unused, kept so existing callers keep working
+    :return: ``(output_list, text_index)`` - the trimmed substrings and
+        their ``[begin, end]`` offsets into ``input``
+    """
     # 全文下标
     text_index = []
     match1 = re.finditer(reg, input)
+    output_list = []
+    # Named groups whose text is skipped at the front of a match.
+    front_keys = ("before", "before2", "before4", "before5", "before6",
+                  "before7", "charac", "after4")
     for i in match1:
         d = i.groupdict()
-        if d.get("before") is not None:
-            front_len = len(d.get("before")) + len(d.get("charac"))
-        else:
-            front_len = 0
-        text_index.append([i.start()+front_len, i.end()])
+        if TEST_MODE:
+            for key in d.keys():
+                if d.get(key):
+                    print('d.get("' + key + '")', d.get(key))
+
+        # Groups that are absent from this pattern or matched empty add 0.
+        front_len = sum(len(d[key]) for key in front_keys if d.get(key))
+
+        # Special case: before3 captured a bracketed unit, so the lengths of
+        # before7 and charac are taken back out of the skipped prefix.
+        # NOTE(review): groups after charac (before5, before2, ...) are still
+        # counted, which can make the span start inside before3 - confirm
+        # whether front_len should simply be len(before) in this case.
+        if d.get("before3"):
+            front_len -= len(d.get("before7") or "")
+            front_len -= len(d.get("charac") or "")
 
-    return output, text_index
+        text_index.append([i.start()+front_len, i.end()])
+        output_list.append(input[i.start()+front_len: i.end()])
+    return output_list, text_index
 
 
 def calculateLen(ss, i):
@@ -152,54 +379,70 @@ def calculateLen(ss, i):
 
 def extract_servicetime(text):
+    """Return the service-period candidates found in ``text``.
+
+    Each result is a dict with ``body`` (the matched text as returned by
+    ``re_serviceTime``) and ``begin_index`` / ``end_index`` (its offsets).
+    Candidates whose body is longer than 35 characters are skipped.
+    """
     list_servicetime = []
-    word, text_index_list = re_serviceTime(text)
+    word_list, text_index_list = re_serviceTime(text)
     # print(word, text_index_list)
     for i in range(len(text_index_list)):
-        word_list = word.split(" ")
         d = {"body": word_list[i], "begin_index": text_index_list[i][0], "end_index": text_index_list[i][1]}
         if len(word_list[i]) <= 35:
             list_servicetime.append(d)
-    # print(list_servicetime)
-
+    if TEST_MODE:
+        print("list_servicetime", list_servicetime)
     return list_servicetime
 
 
-if __name__ == '__main__':
-    s = """
-    4、其他:无
-
-合同履行期限:双方签订合同后30天内
-
-本项目( 不接受 )联合体投标。
-
-二、申请人的资格要求:
-
-1.满足《中华人民共和国政府采购法》第二十二条规定;
-
-2.落实政府采购政策需满足的资格要求:
-
-2.1具备《政府采购法》第二十二条规定的条件,且提供以下证明文件:
-
-(1)在中华人民共和国境内注册的法人或其他组织或自然人, 投标(响应)时提交有效的营业执照(或事业法人登记证或身份证等相关证明)副本复印件。
-
-(2)有依法缴纳税收和社会保障资金的良好记录:提供投标截止日前6个月内任意1个月依法缴纳税收和社会保障资金的相关材料。如依法免税或不需要缴纳社会保障资金的,提供相应证明材料。
-
-(3)供应商必须具有良好的商业信誉和健全的财务会计制度(提供2020年度财务状况报告或基本开户行出具的资信证明)。
-
-(4)履行合同所必需的设备和专业技术能力的证明材料或书面声明。
-
-(5)参加政府采购活动前 3 年内在经营活动中没有重大违法记录的书面声明。
-
-(6)信用记录:供应商未被列入“信用中国”网站(www.creditchina.gov.cn)“记录失信被执行人或重大税收违法案件当事人名单”记录名单; 不处于中国政府采购网(www.ccgp.gov.cn)“政府采购严重违法失信行为信息记录”中的禁止参加政府采购活动期间。
-    """
-    # s = "自合同签订之日起至2022-6-30 自合同签订之日起至2022-07-30"
+def test_from_str():
+    """Run the extractor on one hard-coded sample string and print the result."""
+    # s = """
+    # """
+    s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
     print(extract_servicetime(s))
+    # Scratch check of a date-range sub-pattern against the same sample.
+    print(re.findall('(\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)+[-~~起至到—]+\d{2,4}[-.年/]', s))
+
+
+def test_from_csv():
+    """Run the extractor over the ``text`` column of a CSV and save the results.
+
+    Reads from and writes to fixed local paths; the stringified result of
+    each row is stored in a new ``new_word`` column.
+    """
+    df = pd.read_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text.csv")
+    result_list = [str(extract_servicetime(row["text"]))
+                   for _, row in df.iterrows()]
+
+    df["new_word"] = pd.DataFrame(result_list)
+    df.to_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text_new.csv")
+
+
+def test_from_xlsx():
+    """Run the extractor over the ``dochtmlcon`` column of a workbook.
+
+    Each cell is treated as HTML: its text is pulled out with BeautifulSoup
+    before extraction.  Reads from and writes to fixed local paths; the
+    stringified results go into a new ``new_word`` column.
+    """
+    df = pd.read_excel("D:/BIDI_DOC/比地_文档/service_time_error.xlsx")
+
+    def _extract_from_html(html):
+        plain = BeautifulSoup(html, "lxml").get_text(strip=True)
+        return str(extract_servicetime(plain))
+
+    result_list = [_extract_from_html(row["dochtmlcon"])
+                   for _, row in df.iterrows()]
+
+    df["new_word"] = pd.DataFrame(result_list)
+    df.to_excel("D:/BIDI_DOC/比地_文档/service_time_error_new.xlsx", index=False)
+
+
+# def test_from_db():
+#     engine = create_engine("mysql+pymysql://root:pwdformysql0922@192.168.2.170:3306/"
+#                            "exportdb?charset=utf8")
+#     sql = 'SELECT docid, doctextcon, service_time_1 FROM `wuye_zhouqi_1` where service_time_1 <> "" and service_time_1 is not null;'
+#     # 建立dataframe
+#     df = pd.read_sql_query(sql, engine)
+#     result_list = []
+#     for index, row in df.iterrows():
+#         result = extract_servicetime(row["doctextcon"])
+#         if len(result) > 0:
+#             temp = ""
+#             for r in result:
+#                 temp += r.get("body") + "##"
+#             result_list.append(temp)
+#         else:
+#             result_list.append(np.nan)
+#
+#     df["new_service_time"] = pd.DataFrame(result_list)
+#     df.to_excel("D:/BIDI_DOC/比地_文档/service_time_from_wuye_zhouqi.xlsx", index=False)
+
 
-    # df = pd.read_csv("C:\\Users\\admin\\Desktop\\serviceTime_text.csv")
-    # result_list = []
-    # for index, row in df.iterrows():
-    #     result = extract_servicetime(row["text"])
-    #     result_list.append(str(result))
-    #
-    # df["new_word"] = pd.DataFrame(result_list)
-    # df.to_csv("C:\\Users\\admin\\Desktop\\serviceTime_text_new.csv")
+if __name__ == '__main__':
+    test_from_str()