re_servicetime.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. #coding:UTF-8
  2. import re
  3. import pandas as pd
  4. import numpy as np
  5. from bs4 import BeautifulSoup
  6. # from sqlalchemy import create_engine
  7. TEST_MODE = False
  8. # before = '(?P<before>' \
  9. # '合同期限|工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
  10. # '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
  11. # '|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
  12. # '|交货时间|工期\(日历天\)' \
  13. # '|服务期限为|计划工期|工期要求|服务期限|服务期' \
  14. # '|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期|服务要求' \
  15. # '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期|供货期|合同履行日期|计划周期|工期' \
  16. # ')'
  17. before = '(?P<before>' \
  18. '合同期限|工期/交货期/服务期|工期,|工期\(交货期\)|合格工期|服务期限|工期' \
  19. '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
  20. '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
  21. '|交货时间|工期|质保期' \
  22. '|服务期限为|计划工期|工期要求|服务期限|服务期' \
  23. '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
  24. '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|供货期|合同履行日期|计划周期' \
  25. '|履约期限|合同约定完成时限|合同完成日期|承诺完成日期' \
  26. '|合同起始日起|合同履约期|履约截止日期|承包期限|合同完成日期|特许经营期限' \
  27. '|服务期间|服务履行期|委托(管理)?期限|经营期限' \
  28. ')'
  29. # ^(?!.*abc).*$ 排除abc字符串
  30. before_wuye = '(?P<before>' \
  31. '(履约期限、地点等简要信息[::]((履约|时间|期限){1,2}[::])?)' \
  32. ')'
  33. # '|(履约期限、地点等简要信息[^\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,25})' \
  34. # (履约期限、地点等简要信息.{0,25}(?= [\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+([年月日]|个月)|20[21]))
  35. before2 = '(?P<before2>' \
  36. '自合同签订之日起至|合同签订之日起|自合同签订之日起|签订合同后|系统开发' \
  37. '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
  38. '|[自从]?合同签[订定]生效之日起|自合同签订后不超过|合同签订日至' \
  39. '|合同签订生效之日起|本项目招标有效期' \
  40. '|[自从于]?签[订定署字](合同|协议书|协议)并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,4}' \
  41. '|[自从于]?(采购)?(合同|协议书|协议)(正式)?签[订定署字](完[成毕])?并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}' \
  42. '|签订合同起' \
  43. '|项目的有效期限为|项目服务为|签订合同期为' \
  44. '|(合同|协议书)签[订定署字]生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
  45. '|[自从于]服务(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
  46. '|(本次)?采购周期' \
  47. '|(项目招标)?履行期|[自从于]?(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,3}' \
  48. '|服务(有效期|年限)|本?合同有效期|(正式)?入驻(之[日后]|后|起|算)+' \
  49. '|(合同|协议书|协议)生效(之[日后]|后|起|算)+' \
  50. '|自?(提供服务|采购人指定|合同约定)(之[日后]|后|起|算)+' \
  51. '|本?项目合同期(为|是)*' \
  52. '|交付使用(之[日后]|后|起|算)+|' \
  53. ')'
  54. # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
  55. before3 = '(?P<before3>' \
  56. '([\((]日历天[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
  57. ')'
  58. before4 = '(?P<before4>' \
  59. '(履约|[本项目原则上]*一招|期限|(服务|合同)(期|)|合计|均为|开工后|不超过|中选后|计划|达到|本合同|)' \
  60. ')'
  61. charac = '(?P<charac>' \
  62. '[::,,【()】]*' \
  63. ')'
  64. # charac前后、center前、after1前 需加
  65. before5 = '(?P<before5>' \
  66. '[自为约是起暂定的拟有效期限从共计至算是要求总服务到本项目]{0,5}' \
  67. ')'
  68. before6 = '(?P<before6>' \
  69. '[自为约是起暂定的拟有效期限从共计至算是要求总服务到本项目]{0,5}' \
  70. ')'
  71. before7 = '(?P<before7>' \
  72. '[自为约是起暂定的拟有效期限从共计至算是要求总服务到本项目]{0,5}' \
  73. ')'
  74. center = '(?P<center>' \
  75. '(\d{2,4}[-.年/](\d{1,2}[-.月/])?(\d{0,2}[日号]?)?[-~~开始起至到—-]+(\d{2,4}[-.年/]\d{1,2}[-.月/]\d{0,2}[日号]?|\d{2,4}[-.年/]\d{1,2}[-.月/]?|\d{1,2}[-.月/]\d{1,2}[日号]?|\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)' \
  76. '|\d{2,4}[-.年/]\d{1,2}[-.月/](\d{1,2}[日号]?)?' \
  77. '|[+\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+)(\)|)' \
  78. ')'
  79. center2 = '(?P<center2>' \
  80. '[.\d]+个?[月年]' \
  81. ')'
  82. number = '(?P<number>' \
  83. '[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+' \
  84. ')'
  85. after = '(?P<after>' \
  86. '[个,,(\(]*(日历|工作|学|)([年月日天周]|周年|整年)(内|)|\)|)|' \
  87. ')'
  88. # '|周|号|天|个月|个年|((|\(|)年()|\)|)|((|\(|)月()|\)|)|((|\(|)日()|\)|)' \
  89. # '|个日历天|日历天|\(日历天\)|\(天\)|周内|,日历天|工作日|个工作日|' \
  90. after1 = '(?P<after1>' \
  91. '\d{2,4}[-.年/](\d{1,2}[-.月/])?(\d{1,2}[日号])?[-~~开始起至到—]+(\d{2,4}[-.年/]\d{1,2}[-.月/]\d{0,2}[日号]?|\d{2,4}[-.年/]\d{1,2}[-.月/]?|\d{1,2}[-.月/]\d{1,2}[日号]?|\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)([】)]?)' \
  92. ')'
  93. after2 = '(?P<after2>' \
  94. '\d+' \
  95. ')'
  96. after3 = '(?P<after3>' \
  97. '(([\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾][年月日])?)' \
  98. ')'
  99. after4 = '(?P<after4>' \
  100. '[^\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,25}止' \
  101. ')'
  102. reg = re.compile(before + before3 + before7 + charac + before5 + before2 + before4 + before6 + center + after)
  103. reg1 = re.compile(before + before3 + before7 + charac + before5 + after4 + after3)
  104. reg2 = re.compile(before + before3 + before7 + charac + before5 + before2 + before6 + after1)
  105. reg3 = re.compile(before + before3 + before7 + charac + before5 + before2 + after2)
  106. reg4 = re.compile(before2[:-2]+before2[-1:] + before5 + center + after)
  107. reg5 = re.compile(before + before3 + before7 + charac + before5 + before2 + before4 + before6 + center2 + after)
  108. # reg4 = re.compile(before2[:-2]+before2[-1:] + number + after)
  109. # print(before2[:-2]+before2[-1:])
  110. reg_wuye = re.compile(before_wuye + before4 + before5 + center + after)
  111. reg_not = re.compile(u'(工期延误|工期节点|工期管理'
  112. u'|工期、|终止)'
  113. u'|工期情况|划工期内|服务期内'
  114. u'|(\d{1,2}:\d{1,2}(:\d{1,2})?)')
  115. reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,'
  116. u'|务期限:1、|签订日期|证金在合同签|服务期限截止'
  117. u')')
  118. # reg_not2 = re.compile(u'(截止|1\\.|1、)')
  119. # reg_not2 = re.compile(u'')
  120. reg_right_digit = re.compile(u'[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+')
  121. reg_right_unit = re.compile(u'[-.年月日号天~~至到—/]')
  122. reg_error = re.compile(u'公告|发布|中')
  123. def re_serviceTime(text):
  124. if TEST_MODE:
  125. # print(chardet.detect(text))
  126. text = re.sub("\s*", "", text)
  127. text_list = []
  128. text_list.append(text)
  129. # 初始化
  130. all_output_list = []
  131. all_text_index_list = []
  132. for index in range(len(text_list)):
  133. # 初始化
  134. output_list = []
  135. input_str = text_list[index]
  136. # 替换混淆词
  137. for _reg_not in [reg_not, reg_not1]:
  138. match_iter = re.finditer(_reg_not, input_str)
  139. for match in match_iter:
  140. word_index = match.span()
  141. word = match.group()
  142. instead = "#" * len(word)
  143. print("word, instead, word_index", word, instead, word_index)
  144. input_str = input_str[:word_index[0]] + instead + input_str[word_index[1]:]
  145. if TEST_MODE:
  146. print("input_str", input_str)
  147. # 匹配
  148. output_list, text_index_list = re_findAllResult(reg_wuye, input_str)
  149. if TEST_MODE:
  150. print("output_str, text_index reg_wuye", output_list, text_index_list)
  151. output_list, text_index_list = filter_service_time(output_list, text_index_list)
  152. if len(output_list) == 0:
  153. output_list, text_index_list = re_findAllResult(reg2, input_str)
  154. if TEST_MODE:
  155. print("output_str, text_index reg2", output_list, text_index_list)
  156. output_list, text_index_list = filter_service_time(output_list, text_index_list)
  157. if len(output_list) == 0:
  158. output_list, text_index_list = re_findAllResult(reg, input_str)
  159. if TEST_MODE:
  160. print("output_str, text_index reg", output_list, text_index_list)
  161. output_list, text_index_list = filter_service_time(output_list, text_index_list)
  162. if len(output_list) == 0:
  163. output_list, text_index_list = re_findAllResult(reg1, input_str)
  164. if TEST_MODE:
  165. print("output_str, text_index reg1", output_list, text_index_list)
  166. output_list, text_index_list = filter_service_time(output_list, text_index_list)
  167. if len(output_list) == 0:
  168. output_list, text_index_list = re_findAllResult(reg3, input_str)
  169. if TEST_MODE:
  170. print("output_str, text_index reg3", output_list, text_index_list)
  171. output_list, text_index_list = filter_service_time(output_list, text_index_list)
  172. if len(output_list) == 0:
  173. output_list, text_index_list = re_findAllResult(reg4, input_str)
  174. if TEST_MODE:
  175. print("output_str, text_index reg4", output_list, text_index_list)
  176. output_list, text_index_list = filter_service_time(output_list, text_index_list)
  177. if len(output_list) == 0:
  178. output_list, text_index_list = re_findAllResult(reg5, input_str)
  179. if TEST_MODE:
  180. print("output_str, text_index reg5", output_list, text_index_list)
  181. output_list, text_index_list = filter_service_time(output_list, text_index_list)
  182. # 添加
  183. all_output_list += output_list
  184. all_text_index_list += text_index_list
  185. index2word = []
  186. for i in range(len(all_text_index_list)):
  187. word = text[all_text_index_list[i][0]:all_text_index_list[i][1]]
  188. if i != len(all_text_index_list)-1:
  189. word = word + " "
  190. index2word.append(word)
  191. if TEST_MODE:
  192. print("index2word all_text_index_list", index2word, all_text_index_list)
  193. return index2word, all_text_index_list
  194. def filter_service_time(output_list, text_index_list):
  195. # 过滤
  196. delete_list = []
  197. for i in range(len(output_list)):
  198. output = output_list[i]
  199. # 日期影响
  200. if re.findall("日", output) and not re.findall(reg_right_unit, re.sub("日期", "", output)):
  201. delete_list.append([output, text_index_list[i]])
  202. print("delete output", output)
  203. continue
  204. # 不包含数字、单位的
  205. if not re.findall(reg_right_digit, output):
  206. delete_list.append([output, text_index_list[i]])
  207. continue
  208. if not re.findall(reg_right_unit, output):
  209. delete_list.append([output, text_index_list[i]])
  210. continue
  211. # 包含不要的字
  212. if re.findall(reg_error, output):
  213. delete_list.append([output, text_index_list[i]])
  214. continue
  215. # 类似2019年的
  216. if not re.findall("[-./月日天号]", output):
  217. if len(re.findall("年", output)) == 1:
  218. year_time = re.search("\d+", output)
  219. if year_time is not None and int(year_time.group()) >= 2000:
  220. delete_list.append([output, text_index_list[i]])
  221. for output, text_index in delete_list:
  222. if output in output_list:
  223. output_list.remove(output)
  224. if text_index in text_index_list:
  225. text_index_list.remove(text_index)
  226. if TEST_MODE:
  227. print("delete_list", delete_list)
  228. return output_list, text_index_list
  229. def re_findAllResult(reg, input, unit="", index=0):
  230. """
  231. :param reg: 正则表达式
  232. :param input: 待匹配句子
  233. :param unit: 需要加的单位
  234. :param index: 字符串拼接的开始位置
  235. :return: 正则后的字符串
  236. """
  237. # 全文下标
  238. text_index = []
  239. match1 = re.finditer(reg, input)
  240. output_list = []
  241. for i in match1:
  242. output = ""
  243. d = i.groupdict()
  244. if d.get("before"):
  245. output += d.get("before")
  246. if d.get("before3"):
  247. output += d.get("before3")
  248. if d.get("before7"):
  249. output += d.get("before7")
  250. if d.get("charac"):
  251. output += d.get("charac")
  252. if d.get("before2"):
  253. output += d.get("before2")
  254. if d.get("before4"):
  255. output += d.get("before4")
  256. if d.get("before5"):
  257. output += d.get("before5")
  258. if d.get("before6"):
  259. output += d.get("before6")
  260. if d.get("center"):
  261. output += d.get("center")
  262. if d.get("number"):
  263. output += d.get("number")
  264. if d.get("after"):
  265. output += d.get("after")
  266. if d.get("after1"):
  267. output += d.get("after1")
  268. if d.get("after2"):
  269. output += d.get("after2")
  270. if d.get("after4"):
  271. output += d.get("after4")
  272. if d.get("after3"):
  273. output += d.get("after3")
  274. if TEST_MODE:
  275. for key in d.keys():
  276. if d.get(key):
  277. print('d.get("' + key + '")', d.get(key))
  278. # if d.get("before") is not None:
  279. # if d.get("before3") is None or d.get("before3") != "":
  280. # front_len = len(d.get("before"))
  281. # # print("1-", len(d.get("before")))
  282. # else:
  283. # front_len = len(d.get("before")) + len(d.get("charac"))
  284. # # print("2-", len(d.get("before")), len(d.get("charac")))
  285. # if d.get("before2") is not None:
  286. # front_len += len(d.get("before2"))
  287. # if d.get("before4") is not None:
  288. # front_len += len(d.get("before4"))
  289. # else:
  290. # if d.get("before2") is not None:
  291. # front_len = len(d.get("before2"))
  292. # else:
  293. # front_len = 0
  294. front_len = 0
  295. for key in d.keys():
  296. if d.get(key) and key in ["before", "before2", "before4",
  297. "before5", "before6", "before7", "charac",
  298. "after4"]:
  299. front_len += len(d.get(key))
  300. # 特殊情况
  301. if d.get("before3"):
  302. front_len -= len(d.get("before7"))
  303. front_len -= len(d.get("charac"))
  304. text_index.append([i.start()+front_len, i.end()])
  305. output_list.append(input[i.start()+front_len: i.end()])
  306. return output_list, text_index
  307. def calculateLen(ss, i):
  308. front_len = 0
  309. back_len = 0
  310. for index in range(i):
  311. front_len += len(ss[index])
  312. for index in range(i+1, len(ss)):
  313. back_len += len(ss[index])
  314. return front_len, back_len
  315. def extract_servicetime(text):
  316. list_servicetime = []
  317. word_list, text_index_list = re_serviceTime(text)
  318. # print(word, text_index_list)
  319. for i in range(len(text_index_list)):
  320. d = {"body": word_list[i], "begin_index": text_index_list[i][0], "end_index": text_index_list[i][1]}
  321. if len(word_list[i]) <= 35:
  322. list_servicetime.append(d)
  323. if TEST_MODE:
  324. print("list_servicetime", list_servicetime)
  325. return list_servicetime
  326. def test_from_str():
  327. # s = """
  328. # """
  329. s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
  330. print(extract_servicetime(s))
  331. print(re.findall('(\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)+[-~~起至到—]+\d{2,4}[-.年/]', s))
  332. def test_from_csv():
  333. df = pd.read_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text.csv")
  334. result_list = []
  335. for index, row in df.iterrows():
  336. result = extract_servicetime(row["text"])
  337. result_list.append(str(result))
  338. df["new_word"] = pd.DataFrame(result_list)
  339. df.to_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text_new.csv")
  340. def test_from_xlsx():
  341. df = pd.read_excel("D:/BIDI_DOC/比地_文档/service_time_error.xlsx")
  342. result_list = []
  343. for index, row in df.iterrows():
  344. text = row["dochtmlcon"]
  345. soup = BeautifulSoup(text, "lxml")
  346. text = soup.get_text(strip=True)
  347. result = extract_servicetime(text)
  348. result_list.append(str(result))
  349. df["new_word"] = pd.DataFrame(result_list)
  350. df.to_excel("D:/BIDI_DOC/比地_文档/service_time_error_new.xlsx", index=False)
  351. # def test_from_db():
  352. # engine = create_engine("mysql+pymysql://root:pwdformysql0922@192.168.2.170:3306/"
  353. # "exportdb?charset=utf8")
  354. # sql = 'SELECT docid, doctextcon, service_time_1 FROM `wuye_zhouqi_1` where service_time_1 <> "" and service_time_1 is not null;'
  355. # # 建立dataframe
  356. # df = pd.read_sql_query(sql, engine)
  357. # result_list = []
  358. # for index, row in df.iterrows():
  359. # result = extract_servicetime(row["doctextcon"])
  360. # if len(result) > 0:
  361. # temp = ""
  362. # for r in result:
  363. # temp += r.get("body") + "##"
  364. # result_list.append(temp)
  365. # else:
  366. # result_list.append(np.nan)
  367. #
  368. # df["new_service_time"] = pd.DataFrame(result_list)
  369. # df.to_excel("D:/BIDI_DOC/比地_文档/service_time_from_wuye_zhouqi.xlsx", index=False)
  370. if __name__ == '__main__':
  371. test_from_str()