|
@@ -156,7 +156,7 @@ reg_right_unit = re.compile(u'[-.年月日号天~~至到—/]')
|
|
|
reg_error = re.compile(u'公告|发布|中')
|
|
|
|
|
|
|
|
|
-def re_serviceTime(text):
|
|
|
+def re_service_time(text):
|
|
|
if TEST_MODE:
|
|
|
# print(chardet.detect(text))
|
|
|
text = re.sub("\s*", "", text)
|
|
@@ -186,46 +186,53 @@ def re_serviceTime(text):
|
|
|
print("input_str", input_str)
|
|
|
|
|
|
# 匹配
|
|
|
- output_list, text_index_list = re_findAllResult(reg_wuye, input_str)
|
|
|
+ output_list, text_index_list = re_find_all_result(reg_wuye, input_str)
|
|
|
if TEST_MODE:
|
|
|
print("output_str, text_index reg_wuye", output_list, text_index_list)
|
|
|
output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+ prob = 0.9
|
|
|
|
|
|
if len(output_list) == 0:
|
|
|
- output_list, text_index_list = re_findAllResult(reg2, input_str)
|
|
|
+ output_list, text_index_list = re_find_all_result(reg2, input_str)
|
|
|
if TEST_MODE:
|
|
|
print("output_str, text_index reg2", output_list, text_index_list)
|
|
|
output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+ prob = 0.8
|
|
|
|
|
|
if len(output_list) == 0:
|
|
|
- output_list, text_index_list = re_findAllResult(reg, input_str)
|
|
|
+ output_list, text_index_list = re_find_all_result(reg, input_str)
|
|
|
if TEST_MODE:
|
|
|
print("output_str, text_index reg", output_list, text_index_list)
|
|
|
output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+ prob = 0.8
|
|
|
|
|
|
if len(output_list) == 0:
|
|
|
- output_list, text_index_list = re_findAllResult(reg1, input_str)
|
|
|
+ output_list, text_index_list = re_find_all_result(reg1, input_str)
|
|
|
if TEST_MODE:
|
|
|
print("output_str, text_index reg1", output_list, text_index_list)
|
|
|
output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+ prob = 0.8
|
|
|
|
|
|
if len(output_list) == 0:
|
|
|
- output_list, text_index_list = re_findAllResult(reg3, input_str)
|
|
|
+ output_list, text_index_list = re_find_all_result(reg3, input_str)
|
|
|
if TEST_MODE:
|
|
|
print("output_str, text_index reg3", output_list, text_index_list)
|
|
|
output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+ prob = 0.8
|
|
|
|
|
|
if len(output_list) == 0:
|
|
|
- output_list, text_index_list = re_findAllResult(reg4, input_str)
|
|
|
+ output_list, text_index_list = re_find_all_result(reg4, input_str)
|
|
|
if TEST_MODE:
|
|
|
print("output_str, text_index reg4", output_list, text_index_list)
|
|
|
output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+ prob = 0.5
|
|
|
|
|
|
if len(output_list) == 0:
|
|
|
- output_list, text_index_list = re_findAllResult(reg5, input_str)
|
|
|
+ output_list, text_index_list = re_find_all_result(reg5, input_str)
|
|
|
if TEST_MODE:
|
|
|
print("output_str, text_index reg5", output_list, text_index_list)
|
|
|
output_list, text_index_list = filter_service_time(output_list, text_index_list)
|
|
|
+ prob = 0.8
|
|
|
|
|
|
# 添加
|
|
|
all_output_list += output_list
|
|
@@ -240,7 +247,7 @@ def re_serviceTime(text):
|
|
|
|
|
|
if TEST_MODE:
|
|
|
print("index2word all_text_index_list", index2word, all_text_index_list)
|
|
|
- return index2word, all_text_index_list
|
|
|
+ return index2word, all_text_index_list, prob
|
|
|
|
|
|
|
|
|
def filter_service_time(output_list, text_index_list):
|
|
@@ -282,7 +289,7 @@ def filter_service_time(output_list, text_index_list):
|
|
|
return output_list, text_index_list
|
|
|
|
|
|
|
|
|
-def re_findAllResult(reg, input, unit="", index=0):
|
|
|
+def re_find_all_result(reg, input, unit="", index=0):
|
|
|
"""
|
|
|
|
|
|
:param reg: 正则表达式
|
|
@@ -296,61 +303,13 @@ def re_findAllResult(reg, input, unit="", index=0):
|
|
|
match1 = re.finditer(reg, input)
|
|
|
output_list = []
|
|
|
for i in match1:
|
|
|
- output = ""
|
|
|
d = i.groupdict()
|
|
|
- if d.get("before"):
|
|
|
- output += d.get("before")
|
|
|
- if d.get("before3"):
|
|
|
- output += d.get("before3")
|
|
|
- if d.get("before7"):
|
|
|
- output += d.get("before7")
|
|
|
- if d.get("charac"):
|
|
|
- output += d.get("charac")
|
|
|
- if d.get("before2"):
|
|
|
- output += d.get("before2")
|
|
|
- if d.get("before4"):
|
|
|
- output += d.get("before4")
|
|
|
- if d.get("before5"):
|
|
|
- output += d.get("before5")
|
|
|
- if d.get("before6"):
|
|
|
- output += d.get("before6")
|
|
|
- if d.get("center"):
|
|
|
- output += d.get("center")
|
|
|
- if d.get("number"):
|
|
|
- output += d.get("number")
|
|
|
- if d.get("after"):
|
|
|
- output += d.get("after")
|
|
|
- if d.get("after1"):
|
|
|
- output += d.get("after1")
|
|
|
- if d.get("after2"):
|
|
|
- output += d.get("after2")
|
|
|
- if d.get("after4"):
|
|
|
- output += d.get("after4")
|
|
|
- if d.get("after3"):
|
|
|
- output += d.get("after3")
|
|
|
|
|
|
if TEST_MODE:
|
|
|
for key in d.keys():
|
|
|
if d.get(key):
|
|
|
print('d.get("' + key + '")', d.get(key))
|
|
|
|
|
|
- # if d.get("before") is not None:
|
|
|
- # if d.get("before3") is None or d.get("before3") != "":
|
|
|
- # front_len = len(d.get("before"))
|
|
|
- # # print("1-", len(d.get("before")))
|
|
|
- # else:
|
|
|
- # front_len = len(d.get("before")) + len(d.get("charac"))
|
|
|
- # # print("2-", len(d.get("before")), len(d.get("charac")))
|
|
|
- # if d.get("before2") is not None:
|
|
|
- # front_len += len(d.get("before2"))
|
|
|
- # if d.get("before4") is not None:
|
|
|
- # front_len += len(d.get("before4"))
|
|
|
- # else:
|
|
|
- # if d.get("before2") is not None:
|
|
|
- # front_len = len(d.get("before2"))
|
|
|
- # else:
|
|
|
- # front_len = 0
|
|
|
-
|
|
|
front_len = 0
|
|
|
for key in d.keys():
|
|
|
if d.get(key) and key in ["before", "before2", "before4",
|
|
@@ -379,10 +338,10 @@ def calculateLen(ss, i):
|
|
|
|
|
|
def extract_servicetime(text):
|
|
|
list_servicetime = []
|
|
|
- word_list, text_index_list = re_serviceTime(text)
|
|
|
+ word_list, text_index_list, prob = re_service_time(text)
|
|
|
# print(word, text_index_list)
|
|
|
for i in range(len(text_index_list)):
|
|
|
- d = {"body": word_list[i], "begin_index": text_index_list[i][0], "end_index": text_index_list[i][1]}
|
|
|
+ d = {"body": word_list[i], "begin_index": text_index_list[i][0], "end_index": text_index_list[i][1], "prob": prob}
|
|
|
if len(word_list[i]) <= 35:
|
|
|
list_servicetime.append(d)
|
|
|
if TEST_MODE:
|