from BaseDataMaintenance.model.ots.BaseModel import BaseModel
from tablestore import *
from BaseDataMaintenance.common.Utils import *
from bs4 import BeautifulSoup
from BaseDataMaintenance.common.Utils import article_limit
import random  # moved up from mid-file; used by the status helpers below
import re      # used throughout; made explicit instead of relying on the wildcard import
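
# Maintenance module for the `document` table in Tablestore (OTS): column-name
# constants, the Document row model, and one-off repair jobs (status turning,
# fingerprint dedup, extract re-queueing, html cleanup) driven from __main__.

# Column names of the `document` table.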
document_partitionkey = "partitionkey"
document_docid = "docid"
document_dochtmlcon = "dochtmlcon"
document_doctextcon = "doctextcon"
document_doctitle = "doctitle"
document_attachmenttextcon = "attachmenttextcon"
document_attachment_path = "page_attachments"
document_attachment_path_filemd5 = "fileMd5"
document_attachment_path_fileTitle = "fileTitle"
document_attachment_path_fileLink = "fileLink"
document_crtime = "crtime"
document_status = "status"
document_page_time = "page_time"
document_attachment_extract_status = "attachment_extract_status"
document_web_source_no = "web_source_no"
document_fingerprint = "fingerprint"
document_opertime = "opertime"
document_docchannel = "docchannel"
document_original_docchannel = "original_docchannel"
document_life_docchannel = "life_docchannel"
document_area = "area"
document_province = "province"
document_city = "city"
document_district = "district"
document_extract_json = "extract_json"
document_bidway = "bidway"
document_industry = "industry"
document_info_type = "info_type"
document_qcodes = "qcodes"
document_project_name = "project_name"
document_project_code = "project_code"
document_project_codes = "project_codes"
document_tenderee = "tenderee"
document_tenderee_addr = "tenderee_addr"
document_tenderee_phone = "tenderee_phone"
document_tenderee_contact = "tenderee_contact"
document_agency = "agency"
document_agency_phone = "agency_phone"
document_agency_contact = "agency_contact"
document_product = "product"
document_moneysource = "moneysource"
document_service_time = "service_time"
document_time_bidclose = "time_bidclose"
document_time_bidopen = "time_bidopen"
document_time_bidstart = "time_bidstart"
document_time_commencement = "time_commencement"
document_time_completion = "time_completion"
document_time_earnest_money_start = "time_earnest_money_start"
document_time_earnest_money_end = "time_earnest_money_end"
document_time_get_file_end = "time_get_file_end"
document_time_get_file_start = "time_get_file_start"
document_time_publicity_end = "time_publicity_end"
document_time_publicity_start = "time_publicity_start"
document_time_registration_end = "time_registration_end"
document_time_registration_start = "time_registration_start"
document_time_release = "time_release"
document_info_source = "info_source"
document_nlp_enterprise = "nlp_enterprise"
document_nlp_enterprise_attachment = "nlp_enterprise_attachment"
document_total_tenderee_money = "total_tenderee_money"

class Document(BaseModel):

    def __init__(self, _dict):
        BaseModel.__init__(self)
        for k, v in _dict.items():
            self.setValue(k, v, True)
        self.table_name = "document"
        # url prefixes recognized by isLegalUrl
        self.prefixs = ["www.bidizhaobiao.com", "bxkc.oss-cn-shanghai.aliyuncs.com"]

    def getPrimary_keys(self):
        return ["partitionkey", "docid"]

    # def delete_row(self, ots_client):
    #     raise NotImplementedError()

    def isLegalUrl(self, _url, _type):
        # _type==0: the url is legal only if it matches a known prefix;
        # any other _type: legal only if it matches none of them.
        _flag = False
        for _prefix in self.prefixs:
            if _url.find(_prefix) >= 0:
                _flag = True
        if _type == 0:
            return _flag
        return not _flag
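
    # Status ranges written by the helpers below, as inferred from this file's
    # own queries (e.g. RangeQuery(document_status, 61, 70) in
    # turn_extract_status); a random value inside the range is chosen,
    # presumably to spread writes across the range:
    #   1-50    initialed / reset for reprocessing
    #   51-60   EAS failed
    #   61-70   queued for extract
    #   151-170 handed to MaxCompute
    #   401-450 duplicate (set in turn_extract_status)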
    def fromInitialed(self):
        self.setValue(document_status, random.randint(1, 50), True)

    def fromEas2Maxcompute(self):
        self.setValue(document_status, random.randint(151, 170), True)

    def fromEasFailed(self):
        self.setValue(document_status, random.randint(51, 60), True)

    def fromEas2Extract(self):
        self.setValue(document_status, random.randint(61, 70), True)
    def updateSWFImages(self, swf_urls):
        # Append swf-derived images to dochtmlcon, once (skipped if the first
        # url is already embedded in the html).
        if len(swf_urls) > 0:
            _dochtmlcon = self.getProperties().get(document_dochtmlcon)
            _soup = BeautifulSoup(_dochtmlcon, "lxml")
            if _soup.find("img", {"src": swf_urls[0]}) is None:
                _div = '<div class="swf_urls">'
                for _url in swf_urls:
                    _div += '<p><img src="%s"/></p>' % (_url)
                _div += "</div>"
                _dochtmlcon += _div
                self.setValue(document_dochtmlcon, _dochtmlcon, True)
    def getRichTextFetch(self, list_html):
        # Wrap each attachment html fragment in a <div>, then trim the result
        # to 50000 characters via article_limit.
        _text = ""
        for _ht in list_html:
            if isinstance(_ht, str):
                _text += "<div>%s</div>" % (_ht)
            elif isinstance(_ht, dict):
                _filemd5 = _ht.get("filemd5", "")
                _html = _ht.get("html", "")
                _text += '<div filemd5="%s">%s</div>' % (_filemd5, _html)
        if len(_text) > 50000:
            _soup = BeautifulSoup(_text, "lxml")
            _soup = article_limit(_soup, 50000)
            _text = re.sub("<html>|</html>|<body>|</body>", "", str(_soup))
        return _text
    def updateAttachment(self, list_html):
        # Replace any existing richTextFetch div in dochtmlcon with freshly
        # rendered attachment html.
        if len(list_html) > 0:
            _dochtmlcon = self.getProperties().get(document_dochtmlcon, "")
            _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
            _dochtmlcon_len = len(bytes(_dochtmlcon, encoding="utf8"))
            fix_len = self.COLUMN_MAX_SIZE - _dochtmlcon_len - 100  # remaining column budget (currently unused)
            # _text = '\n<div style="display:none;" class="richTextFetch">%s</div>' % ("\n".join(list_html))
            _text = '<div style="display:none;" class="richTextFetch">%s</div>' % (self.getRichTextFetch(list_html))
            if _dochtmlcon is not None:
                _soup = BeautifulSoup(_dochtmlcon, "lxml")
                _node = _soup.find("div", attrs={"class": "richTextFetch"})
                if _node is not None:
                    _node.decompose()
                self.setValue(document_dochtmlcon, str(_soup) + _text, True)
    def getTitleFromHtml(self, filemd5, _html):
        # Title of the attachment link whose data attribute equals filemd5.
        _soup = BeautifulSoup(_html, "lxml")
        _find = _soup.find("a", attrs={"data": filemd5})
        _title = ""
        if _find is not None:
            _title = _find.get_text()
        return _title

    def getSourceLinkFromHtml(self, filemd5, _html):
        # Source link of the attachment: href of a matching <a>, else src of a
        # matching <img>.
        _soup = BeautifulSoup(_html, "lxml")
        _find = _soup.find("a", attrs={"filelink": filemd5})
        filelink = ""
        if _find is None:
            _find = _soup.find("img", attrs={"filelink": filemd5})
            if _find is not None:
                filelink = _find.attrs.get("src", "")
        else:
            filelink = _find.attrs.get("href", "")
        return filelink
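
# A minimal usage sketch (assumes an OTS connection from
# BaseDataMaintenance.dataSource.source.getConnect_ots, as used below):
#
#     doc = Document({document_partitionkey: 1, document_docid: 123})
#     doc.fromEas2Extract()               # status -> random value in 61-70
#     doc.update_row(getConnect_ots())    # persist the changed columns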

def turn_extract_status():
    # One-off: scan documents stuck in the extract range (61-70), then mark
    # all but the earliest docid per fingerprint as duplicates (401-450).
    from BaseDataMaintenance.dataSource.source import getConnect_ots
    from BaseDataMaintenance.common.multiThread import MultiThreadHandler
    import queue
    from threading import Thread
    import json
    task_queue = queue.Queue()
    from BaseDataMaintenance.model.ots.attachment import attachment_filemd5, attachment_file_title, attachment_file_link
    ots_client = getConnect_ots()

    def producer(task_queue, ots_client):
        bool_query = BoolQuery(must_queries=[
            # WildcardQuery(document_web_source_no, "00295*"),
            # RangeQuery(document_crtime, "2021-07-26 00:00:00"),
            RangeQuery(document_status, 61, 70, True, True),
            # TermQuery(document_docid, 171146519),
        ])
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document", "document_index",
            SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.DESC)]), limit=100, get_total_count=True),
            columns_to_get=ColumnsToGet([document_fingerprint], return_type=ColumnReturnType.SPECIFIED))
        list_data = getRow_ots(rows)
        print(total_count)
        _count = len(list_data)
        for _data in list_data:
            _document = Document(_data)
            task_queue.put(_document)
        while next_token:
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                "document", "document_index",
                SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                columns_to_get=ColumnsToGet([document_fingerprint], return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            _count += len(list_data)
            print("%d/%d" % (_count, total_count))
            for _data in list_data:
                _document = Document(_data)
                task_queue.put(_document)

    def _handle(item, result_queue, ots_client):
        # change attach value
        # list_attachment = json.loads(item.getProperties().get(document_attachment_path))
        # print("docid", item.getProperties().get(document_docid))
        # for attach in list_attachment:
        #     filemd5 = attach.get(document_attachment_path_filemd5, "")
        #     _document_html = item.getProperties().get(document_dochtmlcon, "")
        #     _file_title = item.getTitleFromHtml(filemd5, _document_html)
        #     filelink = item.getSourceLinkFromHtml(filemd5, _document_html)
        #     attach[document_attachment_path_fileTitle] = _file_title
        #     attach[document_attachment_path_fileLink] = filelink
        # item.setValue(document_attachment_path, json.dumps(list_attachment, ensure_ascii=False), True)
        # item.all_columns.remove(document_dochtmlcon)
        # change status
        # item.setValue(document_status, random.randint(1, 50), True)
        # NOTE: the original re-randomized status to 1-50 here, which would
        # have clobbered the 401-450 duplicate marks set below; the handler
        # now only persists whatever status is already on the item.
        item.update_row(ots_client)

    t_producer = Thread(target=producer, kwargs={"task_queue": task_queue, "ots_client": ots_client})
    t_producer.start()
    t_producer.join()
    # mt = MultiThreadHandler(task_queue, _handle, None, 30, ots_client=ots_client)
    # mt.run()
    dict_fingerprint = {}
    while True:
        try:
            item = task_queue.get(timeout=2)
            fingerprint = item.getProperties().get(document_fingerprint)
            if fingerprint is not None:
                if fingerprint not in dict_fingerprint:
                    dict_fingerprint[fingerprint] = []
                dict_fingerprint[fingerprint].append(item)
        except Exception as e:
            print(e)
            break
    print(len(dict_fingerprint.keys()))
    status_queue = queue.Queue()
    for k, v in dict_fingerprint.items():
        print("key", k, len(v))
        # keep the earliest docid; downgrade the rest to the duplicate range
        v.sort(key=lambda x: x.docid)
        for _d in v[1:]:
            _d.setValue(document_status, random.randint(401, 450), True)
            status_queue.put(_d)
    mt = MultiThreadHandler(status_queue, _handle, None, 30, ots_client=ots_client)
    mt.run()
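
# Every producer in this file repeats the same OTS scan-and-paginate pattern;
# a generic sketch (assuming getRow_ots and the tablestore API as used above;
# iter_rows itself is hypothetical, not part of this module):
#
#     def iter_rows(ots_client, table, index, bool_query, columns):
#         rows, next_token, total_count, _ = ots_client.search(
#             table, index,
#             SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.DESC)]),
#                         limit=100, get_total_count=True),
#             columns_to_get=ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
#         yield from getRow_ots(rows)
#         while next_token:
#             rows, next_token, total_count, _ = ots_client.search(
#                 table, index,
#                 SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
#                 columns_to_get=ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
#             yield from getRow_ots(rows)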

def turn_document_status():
    # One-off: set the region columns (area/province/city/district) on the
    # documents matched by the producer query below.
    from BaseDataMaintenance.dataSource.source import getConnect_ots
    from BaseDataMaintenance.common.multiThread import MultiThreadHandler
    import queue
    from threading import Thread
    import json
    task_queue = queue.Queue()
    from BaseDataMaintenance.model.ots.attachment import attachment_filemd5, attachment_file_title, attachment_file_link
    ots_client = getConnect_ots()

    def producer(task_queue, ots_client):
        bool_query = BoolQuery(
            must_queries=[
                MatchPhraseQuery("doctitle", "珠海城市职业技术学院2022年05月至2022年06月政府采购意向"),
                # BoolQuery(should_queries=[
                #     # TermQuery("tenderee", "山西利民工业有限责任公司"),
                #     # MatchPhraseQuery("doctitle", "中国电信"),
                #     # MatchPhraseQuery("doctextcon", "中国电信"),
                #     # MatchPhraseQuery("attachmenttextcon", "中国电信")]),
                #     # RangeQuery(document_status, 88, 120, True, True),
                #     RangeQuery("page_time", "2022-03-24", "2022-03-25", True, False),
                #     # ExistsQuery
                #     # TermQuery(document_docid, 171146519)
                # ])
            ],
            # must_not_queries=[WildcardQuery("DX004354*")]
        )
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document", "document_index",
            SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.DESC)]), limit=100, get_total_count=True),
            columns_to_get=ColumnsToGet([document_area], return_type=ColumnReturnType.SPECIFIED))
        list_data = getRow_ots(rows)
        print(total_count)
        _count = len(list_data)
        for _data in list_data:
            _document = Document(_data)
            task_queue.put(_document)
        while next_token:
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                "document", "document_index",
                SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                columns_to_get=ColumnsToGet([document_area], return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            _count += len(list_data)
            print("%d/%d" % (_count, total_count))
            for _data in list_data:
                _document = Document(_data)
                task_queue.put(_document)
        # docids = [223820830, 224445409]
        # for docid in docids:
        #     _dict = {document_docid: int(docid),
        #              document_partitionkey: int(docid) % 500 + 1,
        #              }
        #     task_queue.put(Document(_dict))
        # import pandas as pd
        # df = pd.read_excel("2022-01-19_214304_export11.xlsx")
        # for docid, tenderee, win in zip(df["docid"], df["招标单位"], df["中标单位"]):  # tenderee / winning-bidder columns
        #     if not isinstance(tenderee, (str)) or not isinstance(win, (str)) or win == "" or tenderee == "":
        #         # print(docid)
        #         _dict = {document_docid: int(docid),
        #                  document_partitionkey: int(docid) % 500 + 1,
        #                  }
        #         task_queue.put(Document(_dict))
        log("task_queue size:%d" % (task_queue.qsize()))

    def _handle(item, result_queue, ots_client):
        # (the commented-out attachment fix-up block duplicated here in the
        # original has been removed; see _handle in turn_extract_status)
        # item.setValue(document_docchannel, item.getProperties().get(document_original_docchannel), True)
        # item.setValue(document_status, random.randint(151, 171), True)
        item.setValue(document_area, "华南", True)        # South China
        item.setValue(document_province, "广东", True)    # Guangdong
        item.setValue(document_city, "珠海", True)        # Zhuhai
        item.setValue(document_district, "金湾区", True)  # Jinwan District
        item.update_row(ots_client)
        log("update %d status done" % (item.getProperties().get(document_docid)))

    t_producer = Thread(target=producer, kwargs={"task_queue": task_queue, "ots_client": ots_client})
    t_producer.start()
    t_producer.join()
    mt = MultiThreadHandler(task_queue, _handle, None, 30, ots_client=ots_client)
    mt.run()
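
# The pattern above (and in drop_extract2 below): one producer thread fills
# task_queue from an OTS scan, t_producer.join() waits for the scan to finish,
# then MultiThreadHandler drains the queue with 30 workers, each worker
# calling _handle(item, result_queue, ots_client=...).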

def drop_extract2():
    # One-off: move rows out of document_extract2 back into document with
    # status=1, then delete the extract2 row (i.e. re-queue for extraction).
    from BaseDataMaintenance.dataSource.source import getConnect_ots
    from BaseDataMaintenance.common.multiThread import MultiThreadHandler
    import queue
    from threading import Thread
    import json
    task_queue = queue.Queue()
    from BaseDataMaintenance.model.ots.attachment import attachment_filemd5, attachment_file_title, attachment_file_link
    ots_client = getConnect_ots()
    from BaseDataMaintenance.model.ots.document_extract2 import Document_extract2

    def producer(task_queue, ots_client):
        bool_query = BoolQuery(must_queries=[
            BoolQuery(should_queries=[
                # TermQuery("tenderee", "山西利民工业有限责任公司"),
                # MatchPhraseQuery("doctitle", "中国电信"),
                # MatchPhraseQuery("doctextcon", "中国电信"),
                # MatchPhraseQuery("attachmenttextcon", "中国电信")]),
                RangeQuery("status", 1, 1000, True, True),
                # RangeQuery("page_time", "2021-12-20", "2022-01-05", True, False),
                # TermQuery(document_docid, 171146519)
            ]),
            # TermQuery("docid", 228359000)
        ],
            # must_not_queries=[NestedQuery("sub_docs_json", WildcardQuery("sub_docs_json.win_tenderer", "*"))]
        )
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document_extract2", "document_extract2_index",
            SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.DESC)]), limit=100, get_total_count=True),
            columns_to_get=ColumnsToGet(["status"], return_type=ColumnReturnType.SPECIFIED))
        list_data = getRow_ots(rows)
        print(total_count)
        _count = len(list_data)
        for _data in list_data:
            task_queue.put(_data)
        while next_token:
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                "document_extract2", "document_extract2_index",
                SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                columns_to_get=ColumnsToGet(["status"], return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            _count += len(list_data)
            print("%d/%d" % (_count, total_count))
            for _data in list_data:
                task_queue.put(_data)
        # (commented-out docid/xlsx seeding duplicated here in the original
        # has been removed; see producer in turn_document_status)
        log("task_queue size:%d" % (task_queue.qsize()))

    def _handle(item, result_queue, ots_client):
        # (the commented-out attachment/status variants duplicated here in the
        # original have been removed; see _handle in turn_extract_status)
        _dict = {}
        _dict.update(item)
        _dict["status"] = 1  # reset status before writing back to document
        print(_dict)
        _document = Document(_dict)
        _document.update_row(ots_client)
        _d_extract = Document_extract2(_dict)
        _d_extract.delete_row(ots_client)

    t_producer = Thread(target=producer, kwargs={"task_queue": task_queue, "ots_client": ots_client})
    t_producer.start()
    t_producer.join()
    mt = MultiThreadHandler(task_queue, _handle, None, 30, ots_client=ots_client)
    mt.run()
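
# Sketch of the requeue step in drop_extract2's _handle, with hypothetical
# values: a row {"partitionkey": 1, "docid": 9, "status": 500} is rewritten to
# `document` as {"partitionkey": 1, "docid": 9, "status": 1}, and the matching
# `document_extract2` row is deleted, re-submitting the document for extract.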

def fixDocumentHtml():
    # One-off: strip an injected "城市之光...Ltd." watermark from doctextcon
    # (document table) and "Evaluation...Ltd." from dochtmlcon (capacity table).
    from BaseDataMaintenance.dataSource.source import getConnect_ots, getConnect_ots_capacity
    from queue import Queue
    ots_client = getConnect_ots()
    from BaseDataMaintenance.common.multiThread import MultiThreadHandler
    from BaseDataMaintenance.model.ots.document_html import Document_html
    capacity_client = getConnect_ots_capacity()
    list_data = []
    bool_query = BoolQuery(must_queries=[
        MatchPhraseQuery("doctextcon", "信友-城市之光"),
        MatchPhraseQuery("doctextcon", "Copyright"),
        # TermQuery("docid", 254249505)
    ])
    rows, next_token, total_count, is_all_succeed = ots_client.search(
        "document", "document_index",
        SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid")]), get_total_count=True, limit=100),
        columns_to_get=ColumnsToGet(["doctextcon"], return_type=ColumnReturnType.SPECIFIED))
    print("total_count", total_count)
    list_data.extend(getRow_ots(rows))
    while next_token:
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document", "document_index",
            SearchQuery(bool_query, next_token=next_token, get_total_count=True, limit=100),
            columns_to_get=ColumnsToGet(["doctextcon"], return_type=ColumnReturnType.SPECIFIED))
        list_data.extend(getRow_ots(rows))
    task_queue = Queue()
    for _data in list_data:
        task_queue.put(_data)
    _pattern = "(?P<_find>城市之光.*Ltd.)"
    _pattern1 = "(?P<_find>Evaluation.*Ltd.)"

    def _handle(item, result_queue):
        _doctextcon = item.get("doctextcon")
        _search = re.search(_pattern, _doctextcon)
        if _search is not None:  # guard added: the pattern may not match every hit
            print(_search.groupdict().get("_find"))
        item["doctextcon"] = re.sub(_pattern, "", _doctextcon)
        _d = Document(item)
        _d.update_row(ots_client)
        _d1 = {"partitionkey": item.get("partitionkey"),
               "docid": item.get("docid")}
        _dh = Document(_d1)
        _dh.fix_columns(capacity_client, ["dochtmlcon"], True)
        _dochtmlcon = _dh.getProperties().get("dochtmlcon")
        _dochtmlcon = re.sub("\n", "", _dochtmlcon)
        _dochtmlcon = re.sub(_pattern1, "", _dochtmlcon)
        _d1["dochtmlcon"] = _dochtmlcon
        _dh = Document(_d1)
        _dh.update_row(capacity_client)
        # print(re.sub(_pattern, "</div><p><span>", _dochtmlcon))

    mt = MultiThreadHandler(task_queue, _handle, None, 2)
    mt.run()
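
# Each routine above is a standalone one-off job; the commented calls in
# __main__ below suggest exactly one is enabled per run (currently an ad-hoc
# delete driven by an exported xlsx of docids).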

if __name__ == "__main__":
    # turn_extract_status()
    # turn_document_status()
    # drop_extract2()
    # fixDocumentHtml()
    from BaseDataMaintenance.dataSource.source import getConnect_ots
    from BaseDataMaintenance.dataSource.source import getConnect_ots_capacity
    ots_client = getConnect_ots()
    ots_capacity = getConnect_ots_capacity()
    import pandas as pd
    df = pd.read_excel("2022-10-14_190838_数据导出.xlsx")  # "数据导出" = data export
    _count = 0
    for _docid in df["docid"]:
        # NOTE: commented code elsewhere in this file derives partitionkey as
        # int(docid) % 500 + 1; the // used here is kept as written.
        partitionkey = int(_docid) // 500 + 1
        _d = {document_partitionkey: partitionkey,
              document_docid: int(_docid)}
        _doc = Document(_d)
        _doc.delete_row(ots_client)
        _doc.delete_row(ots_capacity)
        _count += 1
        print(_docid)
    print("delete count:%d" % _count)