extract.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. '''
  2. Created on 2019年1月4日
  3. @author: User
  4. '''
  5. import os
  6. from bs4 import BeautifulSoup, Comment
  7. import copy
  8. import re
  9. import sys
  10. import os
  11. import codecs
  12. import requests
  13. import time
  14. from unicodedata import normalize
  15. _time1 = time.time()
  16. sys.path.append(os.path.abspath("../.."))
  17. from BiddingKG.dl.common.Utils import *
  18. import BiddingKG.dl.entityLink.entityLink as entityLink
  19. import BiddingKG.dl.interface.predictor as predictor
  20. import BiddingKG.dl.interface.Preprocessing as Preprocessing
  21. import BiddingKG.dl.interface.getAttributes as getAttributes
  22. import BiddingKG.dl.complaint.punish_predictor as punish_rule
  23. import json
  24. from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
  25. from BiddingKG.dl.ratio.re_ratio import extract_ratio
  26. # 自定义jsonEncoder
  27. class MyEncoder(json.JSONEncoder):
  28. def default(self, obj):
  29. if isinstance(obj, np.ndarray):
  30. return obj.tolist()
  31. elif isinstance(obj, bytes):
  32. return str(obj, encoding='utf-8')
  33. elif isinstance(obj, (np.float_, np.float16, np.float32,
  34. np.float64)):
  35. return float(obj)
  36. elif isinstance(obj,str):
  37. return obj
  38. return json.JSONEncoder.default(self, obj)
def extractCount(extract_dict):
    """Count how many key elements were extracted from one document.

    Scores the result dict by counting the project code/name, tenderee money,
    each role found in ``prem`` (plus its money), and linked contact pairs.
    Supports both the legacy list-shaped roles and the dict-shaped roles.

    :param extract_dict: the merged extraction result (``data_res``)
    :return: integer element count used as a quality score
    """
    # time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
    if len(extract_dict):
        _extract = extract_dict
    else:
        _extract = {}
    # print(_extract)
    dict_pack = _extract.get("prem",{})
    extract_count = 0
    list_code = _extract.get("code",[])
    if len(list_code)>0:
        project_code = list_code[0]
    else:
        project_code = ""
    project_name = _extract.get("name","")
    bidding_budget = ""
    win_tenderer = ""
    win_bid_price = ""
    linklist_count = 0
    for _key in dict_pack.keys():
        # Tenderee budget of this package counts once (first one kept).
        if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
            extract_count += 1
            if bidding_budget=="":
                bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
        for _role in dict_pack[_key]["roleList"]:
            # Legacy format: role as [role_name, role_text, money, ...].
            if isinstance(_role,list):
                extract_count += 1
                if _role[2]!='' and float(_role[2])>0:
                    extract_count += 1
                if _role[0]=="tenderee":
                    # NOTE(review): tenderee/agency are collected but never
                    # used in the returned count.
                    tenderee = _role[1]
                if _role[0]=="win_tenderer":
                    if win_tenderer=="":
                        win_tenderer = _role[1]
                    if _role[2]!='' and float(_role[2])>0:
                        extract_count += 1
                        if win_bid_price=="":
                            win_bid_price = str(float(_role[2]))
                if _role[0]=="agency":
                    agency = _role[1]
            # Current format: role as a dict with role_name/role_text/role_money.
            if isinstance(_role,dict):
                extract_count += 1
                if "role_money" in _role:
                    if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
                        extract_count += 1
                if _role.get("role_name")=="tenderee":
                    tenderee = _role["role_text"]
                if _role.get("role_name")=="win_tenderer":
                    if win_tenderer=="":
                        win_tenderer = _role["role_text"]
                    if "role_money" in _role:
                        if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
                            extract_count += 1
                            if win_bid_price=="":
                                win_bid_price = str(float(_role["role_money"]["money"]))
                if _role["role_name"]=="agency":
                    agency = _role["role_text"]
                # Count non-empty contact link fields (name/phone pairs);
                # two fields are treated as one extracted element below.
                linklist = _role.get("linklist",[])
                for link in linklist:
                    for l in link:
                        if l!="":
                            linklist_count += 1
    extract_count += linklist_count//2
    if project_code!="":
        extract_count += 1
    if project_name!="":
        extract_count += 1
    return extract_count
  107. # 字符编码标准化
  108. def str_normalize(text):
  109. # time1 = time.time()
  110. cn_punctuation = "¥,。:;{}!?()<"
  111. text_split = re.split("([{}])+".format(cn_punctuation),text)
  112. # print(text_split)
  113. new_text = ""
  114. for s in text_split:
  115. if re.search("^[{}]+$".format(cn_punctuation),s):
  116. new_text += s
  117. else:
  118. new_text += normalize('NFKD', s)
  119. # print("str_normalize cost time %s"%str(time.time()-time1))
  120. # print(new_text)
  121. return new_text
# Repair prem entities whose region prefix is incomplete.
def repair_entity(prem,district_dict,list_articles):
    """Prepend missing province/city/district prefixes to truncated role names.

    A role text such as "市中医院" (starts with a bare 省/市/县/区 or 自治区/州/县
    marker) is repaired in place by prepending the region predicted for the
    document, e.g. "广州" + "市中医院".  Only tenderee and agency roles are
    touched; ``role['role_text']`` is mutated in place.

    :param prem: prem result list; prem[0]['prem'] maps package -> role info
    :param district_dict: district predictor output; the 'district' sub-dict
        holds 'province' / 'city' / 'district' (may be '未知'/'全国')
    :param list_articles: articles; only list_articles[0].content is read
    """
    district_dict = district_dict['district']
    # Treat unknown ("未知") / nationwide ("全国") predictions as absent.
    province = district_dict['province'] if district_dict['province'] and district_dict['province'] not in ['未知','全国'] else ""
    city = district_dict['city'] if district_dict['city'] and district_dict['city']!='未知' else ""
    district = district_dict['district'] if district_dict['district'] and district_dict['district']!='未知' else ""
    content_text = list_articles[0].content
    # Full official names of the autonomous regions, keyed by short province name.
    autonomous_region_dict = {
        "新疆":"新疆维吾尔",
        "西藏":"西藏",
        "内蒙古":"内蒙古",
        "广西":"广西壮族",
        "宁夏":"宁夏回族"
    }
    for package,_prem in prem[0]['prem'].items():
        for role in _prem['roleList']:
            if role['role_name'] in ['tenderee','agency']:
                role_text = role['role_text']
                if re.search("^[省市县区]",role_text):
                    # "省道" is a road-type word, not a truncated province prefix.
                    if role_text[0]=='省' and role_text[:2] not in ['省道']:
                        role['role_text'] = province + role_text
                    # likewise "市政"/"市场" are ordinary words, not prefixes
                    elif role_text[0]=='市' and role_text[:2] not in ['市政','市场']:
                        if district+'市' in content_text:
                            # county-level city: the district itself is the city
                            role['role_text'] = district + role_text
                        else:
                            role['role_text'] = city + role_text
                    elif role_text[0] in ['县','区']:
                        role['role_text'] = district + role_text
                elif re.search("^自治[区州县]",role_text):
                    if role_text[:3]=='自治区':
                        role['role_text'] = autonomous_region_dict.get(province,"") + role_text
                    elif role_text[:3] in ['自治县',"自治州"]:
                        # Strip a trailing 自治县/自治州 from the known region name
                        # before re-attaching, to avoid duplicated suffixes.
                        if re.search("自治[县州]?$",district):
                            role['role_text'] = re.sub("自治[县州]?","",district) + role_text
                        elif re.search("族$",district):
                            role['role_text'] = district + role_text
                        elif re.search("自治[县州]?$",city):
                            role['role_text'] = re.sub("自治[县州]?","",city) + role_text
                        elif re.search("族$",city):
                            role['role_text'] = city + role_text
def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
    """Run the full bidding-announcement extraction pipeline on one document.

    The pipeline order matters: several later stages (channel merge, product,
    repair_entity, extractCount) depend on results produced earlier.

    :param doc_id: document identifier, used for logging only
    :param text: raw (HTML) document content
    :param title: document title
    :param page_time: page publish time
    :param web_source_no: source-site number (used by tenderee recall rules)
    :param web_source_name: source-site name
    :param original_docchannel: channel label supplied by the caller
    :return: the merged extraction result serialized as a JSON string
    """
    cost_time = dict()
    start_time = time.time()
    log("start process doc %s"%(str(doc_id)))
    # normalize character encodings first
    text = str_normalize(text)
    list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
    log("get preprocessed done of doc_id%s"%(doc_id))
    cost_time["preprocess"] = round(time.time()-start_time,2)
    cost_time.update(_cost_time)
    # Filter out wrong entities whose value in Redis is 0
    # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
    # depends on sentence order
    # start_time = time.time() # doc channel / lifecycle prediction - obsolete here, moved later 2022/4/29
    # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
    # web_source_no=web_source_no,original_docchannel=original_docchannel)
    # cost_time["channel"] = round(time.time()-start_time,2)
    start_time = time.time() # project code / name extraction
    codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
    log("get codename done of doc_id%s"%(doc_id))
    cost_time["codename"] = round(time.time()-start_time,2)
    start_time = time.time() # role / money model extraction
    predictor.getPredictor("prem").predict(list_sentences,list_entitys)
    log("get prem done of doc_id%s"%(doc_id))
    cost_time["prem"] = round(time.time()-start_time,2)
    # roles_l = get_role_context(doc_id, list_sentences, list_entitys)
    # return roles_l
    # start_time = time.time() # product name & fail reason - obsolete here, moved later 2022/4/29
    # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
    # fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail)
    # # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    # log("get product done of doc_id%s"%(doc_id))
    # cost_time["product"] = round(time.time()-start_time,2)
    start_time = time.time() # regex product attributes: unit price, quantity, brand/spec; project, demand, budget, time
    product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s"%(doc_id))
    cost_time["product_attrs"] = round(time.time()-start_time,2)
    start_time = time.time() # regex role extraction
    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
    cost_time["rule"] = round(time.time()-start_time,2)
    '''正则补充最后一句实体日期格式为招标或代理 2021/12/30;正则最后补充角色及去掉包含 公共资源交易中心 的招标人'''
    start_time = time.time() # final regex role pass
    predictor.getPredictor("roleRuleFinal").predict(list_articles,list_sentences,list_entitys, codeName)
    cost_time["roleRuleFinal"] = round(time.time()-start_time,2)
    start_time = time.time() # regex tenderee recall
    predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName)
    cost_time["tendereeRuleRecall"] = round(time.time()-start_time,2)
    '''规则调整角色概率'''
    start_time = time.time() #
    predictor.getPredictor("rolegrade").predict(list_sentences,list_entitys)
    cost_time["rolegrade"] = round(time.time()-start_time,2)
    '''规则调整金额概率'''
    start_time = time.time() #
    predictor.getPredictor("moneygrade").predict(list_sentences,list_entitys)
    cost_time["moneygrade"] = round(time.time()-start_time,2)
    start_time = time.time() # contact-person model extraction
    predictor.getPredictor("epc").predict(list_sentences,list_entitys)
    log("get epc done of doc_id%s"%(doc_id))
    cost_time["person"] = round(time.time()-start_time,2)
    start_time = time.time() # time-category extraction
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s"%(doc_id))
    cost_time["time"] = round(time.time()-start_time,2)
    start_time = time.time() # deposit payment method
    payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
    cost_time["deposit"] = round(time.time()-start_time,2)
    # Must run after getPredictor("prem").predict and before getAttributes.getPREMs:
    # rule-adjusts supervision/construction/design/survey fees to bid/win money.
    predictor.getPredictor("prem").correct_money_by_rule(title, list_entitys, list_articles)
    # Added 2021-12-29: extract total price and unit price
    start_time = time.time() # total/unit price extraction
    predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
    cost_time["total_unit_money"] = round(time.time()-start_time, 2)
    # depends on sentence order
    start_time = time.time() # entity linking
    entityLink.link_entitys(list_entitys)
    doctitle_refine = entityLink.doctitle_refine(title)
    nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0])
    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time)
    log("get attributes done of doc_id%s"%(doc_id))
    cost_time["attrs"] = round(time.time()-start_time,2)
    '''表格要素提取'''
    table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise, web_source_name)
    # print('表格提取中标人:', table_prem)
    # print('原提取角色:', prem[0]['prem'])
    if table_prem:
        getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem)
    '''候选人提取'''
    candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
    # print('表格提取候选人:', candidate_top3_prem)
    getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem)
    '''获取联合体信息'''
    getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)
    # disabled for now
    # start_time = time.time() # dishonesty/punishment extraction
    # list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
    # cost_time["punish"] = round(time.time()-start_time,2)
    '''修正采购公告表格形式多种采购产品中标价格;中标金额小于所有产品总金额则改为总金额'''
    getAttributes.correct_rolemoney(prem, total_product_money, list_articles)
    '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别''' # depends on prem
    start_time = time.time()
    # content = list_articles[0].content
    # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
    channel_dic, msc = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
    # print('msc', msc)
    cost_time["rule_channel"] = round(time.time()-start_time,2)
    '''一包多中标人提取及所有金额提取'''
    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences)
    start_time = time.time() # product name & fail reason extraction - depends on docchannel result
    fail = channel_dic['docchannel']['docchannel'] == "废标公告"
    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) # returns fail reason only; products are added to Entity objects; 2022/7/29 also returns products for industry classification
    # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    log("get product done of doc_id%s"%(doc_id))
    cost_time["product"] = round(time.time()-start_time,2)
    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time))
    '''更新单一来源招标公告中标角色为预中标'''
    getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)
    '''公告无表格格式时,采购意向预测''' # depends on docchannel, products and prem
    '''把产品要素提取结果在项目名称的添加到 采购需求,预算时间,采购时间 要素中'''
    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time)
    '''行业分类提取,需要用标题、项目名称、产品、及prem 里面的角色'''
    industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)
    '''地区获取'''
    start_time = time.time()
    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
    cost_time["district"] = round(time.time() - start_time, 2)
    '''根据district提取结果修复实体'''
    repair_entity(prem,district,list_articles)
    # '''cap maximum amount by industry'''
    # getAttributes.limit_maximum_amount(prem, industry) # removed 20230703; corrected after merging all elements instead
    '''根据数据源最后召回招标人角色'''
    prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, prem)
    '''根据关键词表生成项目标签'''
    project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem)
    # print(project_label)
    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
    # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
    version_date = {'version_date': '2024-03-28'}
    # merge all per-stage result dicts into the final response
    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
    '''最终检查修正招标、中标金额'''
    getAttributes.limit_maximum_amount(data_res, list_entitys[0])
    data_res["project_label"] = project_label
    data_res["doctitle_refine"] = doctitle_refine
    data_res["nlp_enterprise"] = nlp_enterprise
    data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
    # number of extracted elements (quality score)
    data_res['extract_count'] = extractCount(data_res)
    # whether the document contains a table
    data_res['exist_table'] = 1 if re.search("<td",text) else 0
    data_res["cost_time"] = cost_time
    data_res["success"] = True
    # for _article in list_articles:
    # log(_article.content)
    #
    # for list_entity in list_entitys:
    # for _entity in list_entity:
    # log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
    # (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
    # str(_entity.begin_index),str(_entity.end_index)))
    _extract_json = json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
    # strip control characters that break downstream consumers
    _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "")
    return _extract_json#, list_articles[0].content, get_ent_context(list_sentences, list_entitys)
  324. def test(name,content):
  325. user = {
  326. "content": content,
  327. "id":name
  328. }
  329. myheaders = {'Content-Type': 'application/json'}
  330. _resp = requests.post("http://192.168.2.102:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
  331. resp_json = _resp.content.decode("utf-8")
  332. # print(resp_json)
  333. return resp_json
  334. def get_ent_context(list_sentences, list_entitys):
  335. rs_list = []
  336. sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
  337. for list_entity in list_entitys:
  338. for _entity in list_entity:
  339. if _entity.entity_type in ['org', 'company', 'money']:
  340. s = sentences[_entity.sentence_index].sentence_text
  341. b = _entity.wordOffset_begin
  342. e = _entity.wordOffset_end
  343. # print("%s %d %.4f; %s %s %s"%(_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
  344. rs_list.append("%s %d %.4f; %s ## %s ## %s"%(_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
  345. return '\n'.join(rs_list)
  346. def get_role_context(docid, list_sentences, list_entitys):
  347. rs_list = []
  348. sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
  349. for list_entity in list_entitys:
  350. for _entity in list_entity:
  351. if _entity.entity_type in ['org', 'company']:
  352. idx = _entity.entity_id
  353. sentence = sentences[_entity.sentence_index]
  354. # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index, size=20,
  355. # center_include=False, word_flag=True, text=_entity.entity_text)
  356. _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=40, center_include=False)
  357. rs_list.append((docid,idx, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
  358. _entity.entity_text, _span[1]))
  359. return rs_list
if __name__=="__main__":
    # Ad-hoc local driver: run predict() twice on one saved HTML file and
    # compare cold-start vs. warm timing.
    import pandas as pd
    t1 = time.time()
    # text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
    title = '打印机'
    # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
    # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
    # for i in range(30,50,1):
    # text = df.loc[i, 'dochtmlcon']
    # rs = json.loads(predict('', text, ''))
    # print(rs['demand_info'])
    # print(rs['product'])
    # print(rs['product_attrs'])
    # print(rs)
    with open('D:/html/2.html', 'r', encoding='utf-8') as f:
        text = f.read()
    t1 = time.time()
    # first call includes model loading / warm-up cost
    print(predict('', text, title))
    t2 = time.time()
    print(predict('', text, title))
    t3 = time.time()
    print('第一次耗时:%.4f, 第二次耗时:%.4f'%(t2-t1, t3-t2))
    # print(predict('',text,title))
    # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
    # df = pd.read_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115_2.xlsx')[:]
    # new_prem = []
    # for i in range(len(df)):
    # i = 530
    # doc_id = df.loc[i, 'docid']
    # text = df.loc[i, 'html']
    # # title = df.loc[i, 'doctitle']
    # rs = predict(doc_id,text)
    # rs = json.loads(rs)
    # prem = json.dumps(rs['prem'], ensure_ascii=False)
    # # print(rs)
    # new_prem.append(prem)
    # print(prem)
    # break
    # df['new_prem'] = pd.Series(new_prem)
    # print('耗时:', time.time()-t1)
    # # df.to_excel('E:/大网站规则识别/大网站要素提取结果20211115.xlsx')
    # df.to_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115.xlsx')
    # # pass