|
@@ -199,7 +199,7 @@ def tableToText(soup):
|
|
|
'''
|
|
|
@summary: 计算每个节点受到的挤压度来判断是否需要染色
|
|
|
'''
|
|
|
- #print("B",inner_table[index])
|
|
|
+ ## print("B",inner_table[index])
|
|
|
min_presure = 3
|
|
|
list_dye = []
|
|
|
first = None
|
|
@@ -260,7 +260,7 @@ def tableToText(soup):
|
|
|
dye_set.add((inner_table[index][h][0],dye_type))
|
|
|
key_set.add(inner_table[index][h][0])
|
|
|
begin = end
|
|
|
- #print("E",inner_table[index])
|
|
|
+ ## print("E",inner_table[index])
|
|
|
|
|
|
|
|
|
|
|
@@ -388,17 +388,17 @@ def tableToText(soup):
|
|
|
|
|
|
for item,values in zip(list_item,list(predict_y)):
|
|
|
_dict[item] = values[1]
|
|
|
- # print("##",item,values)
|
|
|
- #print(_dict)
|
|
|
+ # # print("##",item,values)
|
|
|
+ ## print(_dict)
|
|
|
for i in range(height):
|
|
|
for j in range(width):
|
|
|
item = inner_table[i][j][0]
|
|
|
inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
|
|
|
|
|
|
- # print("=====")
|
|
|
+ # # print("=====")
|
|
|
# for item in inner_table:
|
|
|
- # print(item)
|
|
|
- # print("======")
|
|
|
+ # # print(item)
|
|
|
+ # # print("======")
|
|
|
|
|
|
repairTable(inner_table)
|
|
|
head_list = sliceTable(inner_table)
|
|
@@ -422,10 +422,10 @@ def tableToText(soup):
|
|
|
if re.search(pat_head,_item) is not None and len(item)<8:
|
|
|
inner_table[_h][_w][1] = 1
|
|
|
|
|
|
- # print("=====")
|
|
|
+ # # print("=====")
|
|
|
# for item in inner_table:
|
|
|
- # print(item)
|
|
|
- # print("======")
|
|
|
+ # # print(item)
|
|
|
+ # # print("======")
|
|
|
|
|
|
repairTable(inner_table)
|
|
|
head_list = sliceTable(inner_table)
|
|
@@ -470,7 +470,7 @@ def tableToText(soup):
|
|
|
else:
|
|
|
is_head = False
|
|
|
|
|
|
- #print(temp_item,form_prob)
|
|
|
+ ## print(temp_item,form_prob)
|
|
|
if len(inner_table[i][0][0])>40:
|
|
|
is_long_value = True
|
|
|
if is_head or is_long_value or is_same_value:
|
|
@@ -751,12 +751,12 @@ def tableToText(soup):
|
|
|
pack_text += head+cell["text"]+","
|
|
|
elif re.search(rankPattern,head) is not None: # 2020/11/23 大网站规则发现问题,if 改elif
|
|
|
#排名替换为同一种表达
|
|
|
- print("====",head)
|
|
|
+ # print("====",head)
|
|
|
rank_text += head+cell["text"]+","
|
|
|
- #print(rank_text)
|
|
|
+ ## print(rank_text)
|
|
|
elif re.search(entityPattern,head) is not None:
|
|
|
entity_text += head+cell["text"]+","
|
|
|
- #print(entity_text)
|
|
|
+ ## print(entity_text)
|
|
|
else:
|
|
|
if re.search(moneyPattern,head) is not None and entity_text!="":
|
|
|
money_text += head+cell["text"]+","
|
|
@@ -788,10 +788,10 @@ def tableToText(soup):
|
|
|
elif re.search(rankPattern,head) is not None: # 2020/11/23 大网站规则发现问题,if 改elif
|
|
|
#排名替换为同一种表达
|
|
|
rank_text += head+cell["text"]+","
|
|
|
- #print(rank_text)
|
|
|
+ ## print(rank_text)
|
|
|
elif re.search(entityPattern,head) is not None:
|
|
|
entity_text += head+cell["text"]+","
|
|
|
- #print(entity_text)
|
|
|
+ ## print(entity_text)
|
|
|
else:
|
|
|
text_line += head+cell["text"]+","
|
|
|
text_set.add(str(head+cell["text"]))
|
|
@@ -862,10 +862,10 @@ def tableToText(soup):
|
|
|
# elif re.search(rankPattern,head) is not None: # 2020/11/23 大网站规则发现问题,if 改elif
|
|
|
# #排名替换为同一种表达
|
|
|
# rank_text += head+inner_table[i][j][0]+","
|
|
|
- # #print(rank_text)
|
|
|
+ # ## print(rank_text)
|
|
|
# elif re.search(entityPattern,head) is not None:
|
|
|
# entity_text += head+inner_table[i][j][0]+","
|
|
|
- # #print(entity_text)
|
|
|
+ # ## print(entity_text)
|
|
|
# else:
|
|
|
# text_line += head+inner_table[i][j][0]+","
|
|
|
# text_set.add(str(head+inner_table[i][j][0]))
|
|
@@ -924,10 +924,10 @@ def tableToText(soup):
|
|
|
# continue
|
|
|
# if re.search(rankPattern,head) is not None:
|
|
|
# rank_text += head+inner_table[i][j][0]+","
|
|
|
- # #print(rank_text)
|
|
|
+ # ## print(rank_text)
|
|
|
# elif re.search(entityPattern,head) is not None:
|
|
|
# entity_text += head+inner_table[i][j][0]+","
|
|
|
- # #print(entity_text)
|
|
|
+ # ## print(entity_text)
|
|
|
# else:
|
|
|
# text_line += head+inner_table[i][j][0]+","
|
|
|
# text_set.add(str(head+inner_table[i][j][0]))
|
|
@@ -952,22 +952,22 @@ def tableToText(soup):
|
|
|
#inner_table,head_list = setHead_inline(inner_table)
|
|
|
inner_table,head_list = setHead_initem(inner_table,pat_head)
|
|
|
# inner_table,head_list = setHead_incontext(inner_table,pat_head)
|
|
|
- # print(inner_table)
|
|
|
+ # # print(inner_table)
|
|
|
# for begin in range(len(head_list[:-1])):
|
|
|
# for item in inner_table[head_list[begin]:head_list[begin+1]]:
|
|
|
- # print(item)
|
|
|
- # print("====")
|
|
|
+ # # print(item)
|
|
|
+ # # print("====")
|
|
|
|
|
|
removeFix(inner_table)
|
|
|
|
|
|
- # print("----")
|
|
|
- # print(head_list)
|
|
|
+ # # print("----")
|
|
|
+ # # print(head_list)
|
|
|
# for item in inner_table:
|
|
|
- # print(item)
|
|
|
+ # # print(item)
|
|
|
|
|
|
|
|
|
tbody.string = getTableText(inner_table,head_list)
|
|
|
- #print(tbody.string)
|
|
|
+ ## print(tbody.string)
|
|
|
tbody.name = "turntable"
|
|
|
return inner_table
|
|
|
return None
|
|
@@ -998,9 +998,9 @@ def tableToText(soup):
|
|
|
|
|
|
#数据清洗
|
|
|
def segment(soup,final=True):
|
|
|
- # print("==")
|
|
|
- # print(soup)
|
|
|
- # print("====")
|
|
|
+ # # print("==")
|
|
|
+ # # print(soup)
|
|
|
+ # # print("====")
|
|
|
#segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
|
|
|
subspaceList = ["td",'a',"span","p"]
|
|
|
if soup.name in subspaceList:
|
|
@@ -1223,7 +1223,7 @@ def union_ner(list_ner):
|
|
|
if i not in union_index_set:
|
|
|
result_list.append(list_ner[i])
|
|
|
for item in union_index:
|
|
|
- #print(str(list_ner[item[0]][3])+str(list_ner[item[1]][3]))
|
|
|
+ ## print(str(list_ner[item[0]][3])+str(list_ner[item[1]][3]))
|
|
|
result_list.append((list_ner[item[0]][0],list_ner[item[1]][1],'company',str(list_ner[item[0]][3])+str(list_ner[item[1]][3])))
|
|
|
return result_list
|
|
|
|
|
@@ -1358,8 +1358,8 @@ def union_ner(list_ner):
|
|
|
# index = 0
|
|
|
# for i in range(len(all_match)):
|
|
|
# if len(all_match[i][0])>0:
|
|
|
-# # print("===",all_match[i])
|
|
|
-# #print(all_match[i][0])
|
|
|
+# # # print("===",all_match[i])
|
|
|
+# ## print(all_match[i][0])
|
|
|
# unit = ""
|
|
|
# entity_text = all_match[i][3]
|
|
|
# if pattern_key in ["key_word","front_m"]:
|
|
@@ -1570,6 +1570,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
#限流执行
|
|
|
key_nerToken = "nerToken"
|
|
|
start_time = time.time()
|
|
|
+ found_yeji = 0 # 2021/8/6 增加判断是否正文包含评标结果 及类似业绩判断用于过滤后面的金额
|
|
|
+ # found_pingbiao = False
|
|
|
ner_entitys_all = getNers(sentences,useselffool=useselffool)
|
|
|
if key_nerToken not in cost_time:
|
|
|
cost_time[key_nerToken] = 0
|
|
@@ -1627,10 +1629,16 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
entity_type = "money"
|
|
|
#money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[\(\)()元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[¥¥]+,?|报价|标价)[(\(]?([万])?元?[)\)]?[::]?.{,7}?([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)|([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)[\((]?([万元]{1,2}))*"
|
|
|
|
|
|
+ # list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
|
|
|
+ # "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限]价|金额|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只]*))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,8}?))(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)(?:[(\(]?(?P<filter_>[%])*\s*(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只]*))\s*[)\)]?))",
|
|
|
+ # "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
|
|
|
+ # "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
|
|
|
list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
|
|
|
- "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限]价|金额|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只]*))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,8}?))(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)(?:[(\(]?(?P<filter_>[%])*\s*(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只]*))\s*[)\)]?))",
|
|
|
+ "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万元]*(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号]{,8}?))(第[123一二三]名[::])?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万元]*(?P<filter_unit1>[台个只吨斤棵株页亩方条米]*))\s*[)\)]?))",
|
|
|
"front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
|
|
|
- "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
|
|
|
+ "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
|
|
|
+ # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
|
|
|
+
|
|
|
pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
|
|
|
set_begin = set()
|
|
|
# for pattern_key in list_money_pattern.keys():
|
|
@@ -1641,8 +1649,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
# index = 0
|
|
|
# for i in range(len(all_match)):
|
|
|
# if len(all_match[i][0])>0:
|
|
|
- # print("===",all_match[i])
|
|
|
- # #print(all_match[i][0])
|
|
|
+ # # print("===",all_match[i])
|
|
|
+ # ## print(all_match[i][0])
|
|
|
# unit = ""
|
|
|
# entity_text = all_match[i][3]
|
|
|
# if pattern_key in ["key_word","front_m"]:
|
|
@@ -1689,18 +1697,32 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
#
|
|
|
# else:
|
|
|
# index += 1
|
|
|
- all_match = re.finditer(pattern_money, sentence_text)
|
|
|
+
|
|
|
+ # if re.search('评标结果|候选人公示', sentence_text):
|
|
|
+ # found_pingbiao = True
|
|
|
+ if re.search('业绩', sentence_text):
|
|
|
+ found_yeji += 1
|
|
|
+ if found_yeji >= 2: # 过滤掉业绩后面的所有金额
|
|
|
+ all_match = []
|
|
|
+ else:
|
|
|
+ all_match = re.finditer(pattern_money, sentence_text)
|
|
|
index = 0
|
|
|
for _match in all_match:
|
|
|
if len(_match.group())>0:
|
|
|
# print("===",_match.group())
|
|
|
- # print(_match.groupdict())
|
|
|
+ # # print(_match.groupdict())
|
|
|
+ notes = '' # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
|
|
|
unit = ""
|
|
|
entity_text = ""
|
|
|
text_beforeMoney = ""
|
|
|
filter = ""
|
|
|
filter_unit = False
|
|
|
notSure = False
|
|
|
+ if re.search('业绩', sentence_text[:_match.span()[0]]): # 2021/7/21过滤掉业绩后面金额
|
|
|
+ # print('金额在业绩后面: ', _match.group(0))
|
|
|
+ found_yeji += 1
|
|
|
+ break
|
|
|
+
|
|
|
for k,v in _match.groupdict().items():
|
|
|
if v!="" and v is not None:
|
|
|
if k=='text_key_word':
|
|
@@ -1715,8 +1737,33 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
filter = v
|
|
|
if re.search("filter_unit",k) is not None:
|
|
|
filter_unit = True
|
|
|
+
|
|
|
+ if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()): # 2021/7/19 修正OCR识别小数点为逗号
|
|
|
+ if re.search('[幢栋号楼层]', sentence_text[_match.span()[0]-2:_match.span()[0]]):
|
|
|
+ entity_text = re.sub('\d+,', '', entity_text)
|
|
|
+ else:
|
|
|
+ entity_text = entity_text.replace(',', '.')
|
|
|
+ # print(' 修正OCR识别小数点为逗号')
|
|
|
+
|
|
|
if entity_text.find("元")>=0:
|
|
|
unit = ""
|
|
|
+ if unit == "": #2021/7/21 有明显金额特征的补充单位,避免被过滤
|
|
|
+ if ('¥' in text_beforeMoney or '¥' in text_beforeMoney):
|
|
|
+ unit = '元'
|
|
|
+ # print('明显金额特征补充单位 元')
|
|
|
+ elif re.search('[单报标限]价|金额|价格[::]+$', text_beforeMoney.strip()) and \
|
|
|
+ re.search('\d{5,}',entity_text) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}',entity_text)==None:
|
|
|
+ unit = '元'
|
|
|
+ # print('明显金额特征补充单位 元')
|
|
|
+ elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7})$)|(^\d{,3}(,\d{3})+$)',entity_text):
|
|
|
+ unit = '元'
|
|
|
+ # print('明显金额特征补充单位 元')
|
|
|
+ if unit.find("万") >= 0 and entity_text.find("万") >= 0: #2021/7/19修改为金额文本有万,不计算单位
|
|
|
+ # print('修正金额及单位都有万, 金额:',entity_text, '单位:',unit)
|
|
|
+ unit = "元"
|
|
|
+ if re.search('.*万元万元', entity_text): #2021/7/19 修正两个万元
|
|
|
+ # print(' 修正两个万元',entity_text)
|
|
|
+ entity_text = entity_text.replace('万元万元','万元')
|
|
|
else:
|
|
|
if filter_unit:
|
|
|
continue
|
|
@@ -1742,15 +1789,36 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
|
|
|
|
|
|
entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
|
|
|
+ # print('转换前金额:', entity_text, '单位:', unit)
|
|
|
+ if re.search('总投资', sentence_text[_match.span()[0] - 6:_match.span()[0]]): # 2021/8/5过滤掉总投资金额
|
|
|
+ # print('总投资金额: ', _match.group(0))
|
|
|
+ notes = '总投资'
|
|
|
+ if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
|
|
|
+ notes = '大写'
|
|
|
+ # print("补充备注:notes = 大写")
|
|
|
+ elif re.search('单价', sentence_text[_match.span()[0]:_match.span()[1]]):
|
|
|
+ notes = '单价'
|
|
|
+ # print("补充备注:单价 ",sentence_text[_match.span()[0]-2:_match.span()[1]])
|
|
|
if len(unit)>0:
|
|
|
- entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
|
|
|
+ if unit.find('万')>=0 and len(entity_text.split('.')[0])>=8: # 2021/7/19 修正万元金额过大的情况
|
|
|
+ # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
|
|
|
+ entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit[0])/10000)
|
|
|
+ else:
|
|
|
+ # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
|
|
|
+ entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
|
|
|
else:
|
|
|
- entity_text = str(getUnifyMoney(entity_text))
|
|
|
+ if entity_text.find('万')>=0 and entity_text.split('.')[0].isdigit() and len(entity_text.split('.')[0])>=8:
|
|
|
+ entity_text = str(getUnifyMoney(entity_text)/10000)
|
|
|
+ # print('修正金额字段含万 过大的情况')
|
|
|
+ else:
|
|
|
+ entity_text = str(getUnifyMoney(entity_text))
|
|
|
|
|
|
if float(entity_text)<100 or float(entity_text)>100000000000:
|
|
|
+ # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
|
|
|
continue
|
|
|
|
|
|
if notSure and unit=="" and float(entity_text)>100*10000:
|
|
|
+ # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
|
|
|
continue
|
|
|
|
|
|
_exists = False
|
|
@@ -1762,7 +1830,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
if not _exists:
|
|
|
if float(entity_text)>1:
|
|
|
list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp))
|
|
|
-
|
|
|
+ list_sentence_entitys[-1].notes = notes # 2021/7/20 新增金额备注
|
|
|
else:
|
|
|
index += 1
|
|
|
|
|
@@ -1824,7 +1892,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
end_index = changeIndexFromWordToWords(tokens, end_index_temp)
|
|
|
if begin_index is None or end_index is None:
|
|
|
continue
|
|
|
- print(begin_index_temp,end_index_temp,begin_index,end_index)
|
|
|
+ # print(begin_index_temp,end_index_temp,begin_index,end_index)
|
|
|
entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
|
|
|
entity_text = bidway['body']
|
|
|
list_sentence_entitys.append(
|
|
@@ -1949,7 +2017,7 @@ def getPredictTable():
|
|
|
df_data["docid"].append(item["docid"])
|
|
|
df_data["json_table"].append(item["json_table"])
|
|
|
except Exception as e:
|
|
|
- print(e)
|
|
|
+ # print(e)
|
|
|
break
|
|
|
df_1 = pd.DataFrame(df_data)
|
|
|
df_1.to_csv("../form/websource_67000_table.csv",columns=["docid","json_table"])
|
|
@@ -1965,7 +2033,7 @@ if __name__=="__main__":
|
|
|
f.write(segment(tableToText(BeautifulSoup(content,"lxml"))))
|
|
|
'''
|
|
|
# content = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
|
|
|
- # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
|
|
|
+ # # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
|
|
|
getPredictTable()
|
|
|
|
|
|
|