1.py 89 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130
  1. #coding:UTF8
  2. from odps.udf import annotate
  3. from odps.udf import BaseUDTF
  4. from odps.udf import BaseUDAF
  5. import re
  6. @annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
  7. class f_decode_extract(BaseUDTF):
  8. def __init__(self):
  9. import logging
  10. import json
  11. import time,re
  12. global json,logging,time,re
  13. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  14. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  15. self.dict_channel = {"公告变更":51,
  16. "招标公告":52,
  17. "中标信息":101,
  18. "招标预告":102,
  19. "招标答疑":103,
  20. "资审结果":105,
  21. "法律法规":106,
  22. "新闻资讯":107,
  23. "采购意向":114,
  24. "拍卖出让":115,
  25. "土地矿产":116,
  26. "产权交易":117,
  27. "废标公告":118,
  28. "候选人公示":119,
  29. "合同公告":120}
  30. def process(self, extractjson,otherjson):
  31. if extractjson is not None:
  32. _extract = json.loads(extractjson)
  33. else:
  34. _extract = {}
  35. if otherjson is not None:
  36. _other = json.loads(otherjson)
  37. else:
  38. _other = {}
  39. project_code = ""
  40. project_name = ""
  41. tenderee = ""
  42. agency = ""
  43. win_tenderer = ""
  44. bidding_budget = ""
  45. win_bid_price = ""
  46. fingerprint = ""
  47. page_time_stamp = 0
  48. docchannel = 0
  49. extract_count = 0
  50. page_time = _other.get("pageTime",time.strftime('%Y-%m-%d',time.localtime()))
  51. doctitle = _other.get("doctitle","")
  52. doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', doctitle)
  53. area = _other.get("area","")
  54. province = _other.get("province","")
  55. city = _other.get("city","")
  56. district = _other.get("district","")
  57. web_source_no = _other.get("webSourceNo","")
  58. time_bidclose = _extract.get("time_bidclose")
  59. time_bidopen = _extract.get("time_bidopen")
  60. time_bidstart = _extract.get("time_bidstart")
  61. time_commencement = _extract.get("time_commencement")
  62. time_completion = _extract.get("time_completion")
  63. time_earnest_money_end = _extract.get("time_earnestMoneyEnd")
  64. time_earnest_money_start = _extract.get("time_earnestMoneyStart")
  65. time_get_file_end = _extract.get("time_getFileEnd")
  66. time_get_file_start = _extract.get("time_getFileStart")
  67. time_publicity_end = _extract.get("time_publicityEnd")
  68. time_publicity_start = _extract.get("time_publicityStart")
  69. time_registration_end = _extract.get("time_registrationEnd")
  70. time_registration_start = _extract.get("time_registrationStart")
  71. time_release = _extract.get("time_release")
  72. # docchannel = _other.get("docchannel",0)
  73. docchannel_name = _extract.get("docchannel",{}).get("docchannel")
  74. doctype_name = _extract.get("docchannel",{}).get("doctype")
  75. if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
  76. docchannel_name = doctype_name
  77. docchannel = self.dict_channel.get(docchannel_name,0)
  78. if re.search(self.time_pattern,page_time) is not None:
  79. try:
  80. timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
  81. page_time_stamp = int(time.mktime(timeArray))
  82. except Exception as e:
  83. pass
  84. list_code = _extract.get("code",[])
  85. if len(list_code)>0:
  86. project_code = list_code[0]
  87. project_name = _extract.get("name","")
  88. fingerprint = _extract.get("fingerprint","")
  89. dict_pack = _extract.get("prem",{})
  90. logging.info(dict_pack)
  91. for _key in dict_pack.keys():
  92. if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  93. extract_count += 1
  94. if bidding_budget=="":
  95. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  96. for _role in dict_pack[_key]["roleList"]:
  97. if isinstance(_role,list):
  98. extract_count += 1
  99. if _role[2]!='' and float(_role[2])>0:
  100. extract_count += 1
  101. if _role[0]=="tenderee":
  102. tenderee = _role[1]
  103. if _role[0]=="win_tenderer":
  104. if win_tenderer=="":
  105. win_tenderer = _role[1]
  106. if _role[2]!='' and float(_role[2])>0:
  107. extract_count += 1
  108. if win_bid_price=="":
  109. win_bid_price = str(float(_role[2]))
  110. if _role[0]=="agency":
  111. agency = _role[1]
  112. if isinstance(_role,dict):
  113. extract_count += 1
  114. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  115. extract_count += 1
  116. if _role["role_name"]=="tenderee":
  117. tenderee = _role["role_text"]
  118. if _role["role_name"]=="win_tenderer":
  119. if win_tenderer=="":
  120. win_tenderer = _role["role_text"]
  121. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  122. extract_count += 1
  123. if win_bid_price=="":
  124. win_bid_price = str(float(_role["role_money"]["money"]))
  125. if _role["role_name"]=="agency":
  126. agency = _role["role_text"]
  127. if project_code!="":
  128. extract_count += 1
  129. if project_name!="":
  130. extract_count += 1
  131. logging.info(page_time+doctitle+doctitle_refine+area+province+city+
  132. district+web_source_no+project_code+project_name+tenderee+agency+win_tenderer+bidding_budget+win_bid_price)
  133. self.forward(page_time,page_time_stamp,docchannel,doctitle,doctitle_refine,area,province,city,
  134. district,web_source_no,fingerprint,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,
  135. time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,
  136. time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release)
  137. @annotate("string->string")
  138. class f_get_product(object):
  139. def __init__(self):
  140. import time
  141. global time
  142. import logging
  143. import json
  144. import re
  145. global json,logging,re
  146. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  147. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  148. def evaluate(self, extractjson):
  149. if extractjson is None or extractjson=="":
  150. extractjson = "{}"
  151. _extract = json.loads(extractjson)
  152. return ",".join(_extract.get("product",[]))
  153. @annotate("string->string")
  154. class f_get_package(object):
  155. def __init__(self):
  156. import time
  157. global time
  158. import logging
  159. import json
  160. import re
  161. global json,logging,re
  162. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  163. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  164. def evaluate(self, extractjson):
  165. if extractjson is None or extractjson=="":
  166. extractjson = "{}"
  167. _extract = json.loads(extractjson)
  168. prem = _extract.get("prem",{})
  169. list_pack = []
  170. for k,v in prem.items():
  171. if k!="Project":
  172. list_pack.append(k)
  173. return ",".join(list_pack)
  174. @annotate("string,string->string")
  175. class f_get_nlp_enterprise(object):
  176. def __init__(self):
  177. import time
  178. global time
  179. import logging
  180. import json
  181. import re
  182. global json,logging,re
  183. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  184. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  185. def evaluate(self, doctextcon,extractjson):
  186. if extractjson is None or extractjson=="":
  187. extractjson = "{}"
  188. _extract = json.loads(extractjson)
  189. nlp_enterprise = _extract.get("nlp_enterprise",[])
  190. dict_pack = _extract.get("prem",{})
  191. for _key in dict_pack.keys():
  192. for _role in dict_pack[_key]["roleList"]:
  193. if isinstance(_role,list):
  194. _entity = _role[1]
  195. nlp_enterprise.append(_entity)
  196. if isinstance(_role,dict):
  197. _entity = _role["role_text"]
  198. nlp_enterprise.append(_entity)
  199. list_entity = list(set(nlp_enterprise))
  200. dict_entity = {"indoctextcon":[],
  201. "notindoctextcon":[]}
  202. for _entity in list_entity:
  203. if str(doctextcon).find(_entity)>=0:
  204. dict_entity["indoctextcon"].append(_entity)
  205. else:
  206. dict_entity["notindoctextcon"].append(_entity)
  207. return json.dumps(dict_entity,ensure_ascii=False)
@annotate("string->bigint")
class f_get_extractCount(object):
    """UDF: score the completeness of one extraction-result json by counting
    how many key fields (project code/name, roles, money amounts) are filled.
    Higher extract_count means a richer extraction; used for dedup ranking."""
    def __init__(self):
        # Imports are done lazily and published via `global` so the ODPS
        # worker process has them available inside evaluate().
        import time
        global time
        import logging
        import json
        import re
        global json,logging,re
        # Loose "YYYY-MM-DD..." pattern; unused here, kept for parity with sibling UDFs.
        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def evaluate(self, extractjson):
        # NULL input counts as an empty extraction (score contributions all 0).
        if extractjson is not None:
            _extract = json.loads(extractjson)
        else:
            _extract = {}
        dict_pack = _extract.get("prem",{})
        extract_count = 0
        list_code = _extract.get("code",[])
        if len(list_code)>0:
            project_code = list_code[0]
        else:
            project_code = ""
        project_name = _extract.get("name","")
        bidding_budget = ""
        win_tenderer = ""
        win_bid_price = ""
        # Walk every package in "prem" and score each filled-in field.
        for _key in dict_pack.keys():
            # A positive tenderee budget counts once; remember the first one seen.
            if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
                extract_count += 1
                if bidding_budget=="":
                    bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
            for _role in dict_pack[_key]["roleList"]:
                # Legacy list layout: [role_name, role_text, money, ...]
                if isinstance(_role,list):
                    extract_count += 1
                    if _role[2]!='' and float(_role[2])>0:
                        extract_count += 1
                    if _role[0]=="tenderee":
                        tenderee = _role[1]
                    if _role[0]=="win_tenderer":
                        if win_tenderer=="":
                            win_tenderer = _role[1]
                        # A positive winning price adds an extra point.
                        if _role[2]!='' and float(_role[2])>0:
                            extract_count += 1
                            if win_bid_price=="":
                                win_bid_price = str(float(_role[2]))
                    if _role[0]=="agency":
                        agency = _role[1]
                # Current dict layout: {"role_name":..., "role_text":..., "role_money":{"money":...}}
                if isinstance(_role,dict):
                    extract_count += 1
                    # role_money may be absent here (unlike f_decode_extract), hence the guards.
                    if "role_money" in _role:
                        if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
                            extract_count += 1
                    if _role.get("role_name")=="tenderee":
                        tenderee = _role["role_text"]
                    if _role.get("role_name")=="win_tenderer":
                        if win_tenderer=="":
                            win_tenderer = _role["role_text"]
                        if "role_money" in _role:
                            if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
                                extract_count += 1
                                if win_bid_price=="":
                                    win_bid_price = str(float(_role["role_money"]["money"]))
                    if _role["role_name"]=="agency":
                        agency = _role["role_text"]
        if project_code!="":
            extract_count += 1
        if project_name!="":
            extract_count += 1
        return extract_count
  278. @annotate('string,string,string,string,string -> string,string,string,bigint')
  279. class f_decode_sub_docs_json(BaseUDTF):
  280. def __init__(self):
  281. import logging
  282. import json
  283. global json,logging
  284. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  285. def process(self, project_code,project_name,tenderee,agency,sub_docs_json):
  286. columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
  287. extract_count = 0
  288. if project_code is not None and project_code!="":
  289. extract_count += 1
  290. if project_name is not None and project_name!="":
  291. extract_count += 1
  292. if tenderee is not None and tenderee!="":
  293. extract_count += 1
  294. if agency is not None and agency!="":
  295. extract_count += 1
  296. if sub_docs_json is not None:
  297. for sub_docs in json.loads(sub_docs_json):
  298. for _key_sub_docs in sub_docs.keys():
  299. extract_count += 1
  300. if _key_sub_docs in columns:
  301. if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
  302. if _key_sub_docs in ["bidding_budget","win_bid_price"]:
  303. if float(sub_docs[_key_sub_docs])>0:
  304. columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
  305. else:
  306. columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
  307. self.forward(columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count)
  308. @annotate("string->bigint")
  309. class totimestamp(object):
  310. def __init__(self):
  311. import time
  312. global time
  313. import logging
  314. import json
  315. import re
  316. global json,logging,re
  317. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  318. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  319. def evaluate(self, str_time):
  320. try:
  321. logging.info(str_time)
  322. if str_time is not None and re.search(self.time_pattern,str_time) is not None:
  323. timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
  324. timeStamp = int(time.mktime(timeArray))
  325. return timeStamp
  326. else:
  327. return 0
  328. except Exception as e:
  329. return 0
  330. @annotate("string->string")
  331. class refind_name(object):
  332. def __init__(self):
  333. import logging
  334. import re
  335. global logging,re
  336. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  337. def evaluate(self, title):
  338. if title is not None:
  339. return re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|\[|\]|【|】', '', title)
  340. return ""
  341. @annotate('bigint,bigint,bigint,string,bigint,string->string')
  342. class f_set_docid(BaseUDAF):
  343. '''
  344. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  345. '''
  346. def __init__(self):
  347. import json
  348. global json
  349. def new_buffer(self):
  350. return [[]]
  351. def iterate(self, buffer,docid, page_time_stamp,extract_count,defind_column,defind_count,tenderee):
  352. buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
  353. "defind_column":defind_column,"defind_count":defind_count,"tenderee":tenderee})
  354. def merge(self, buffer, pbuffer):
  355. buffer[0].extend(pbuffer[0])
  356. def terminate(self, buffer):
  357. list_docs = buffer[0]
  358. list_docs.sort(key=lambda x:x["page_time_stamp"])
  359. list_group = []
  360. _begin = 0
  361. defind_count = 0
  362. if len(list_docs)>0:
  363. defind_count = list_docs[0]["defind_count"]
  364. print(defind_count)
  365. for i in range(len(list_docs)-1):
  366. if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*2:
  367. continue
  368. else:
  369. _group = []
  370. _set_column = set()
  371. _set_tenderee = set()
  372. for j in range(_begin,i+1):
  373. if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
  374. _set_tenderee.add(list_docs[j]["tenderee"])
  375. _set_column.add(list_docs[j]["defind_column"])
  376. _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
  377. if len(_group)>=3 and len(_set_tenderee)>1:
  378. pass
  379. else:
  380. print(defind_count,len(_set_column))
  381. if len(_group)>1:
  382. if defind_count==2:
  383. if len(_set_column)>=2:
  384. list_group.append(_group)
  385. elif defind_count==1:
  386. if len(_set_column)==1:
  387. list_group.append(_group)
  388. elif defind_count==0:
  389. list_group.append(_group)
  390. _begin = i+1
  391. if len(list_docs)>1:
  392. _set_column = set()
  393. _set_tenderee = set()
  394. _group = []
  395. for j in range(_begin,len(list_docs)):
  396. if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
  397. _set_tenderee.add(list_docs[j]["tenderee"])
  398. _set_column.add(list_docs[j]["defind_column"])
  399. _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
  400. if len(_group)>=3 and len(_set_tenderee)>1:
  401. pass
  402. else:
  403. if len(_group)>1:
  404. if defind_count==2:
  405. if len(_set_column)>=2:
  406. list_group.append(_group)
  407. elif defind_count==1:
  408. if len(_set_column)==1:
  409. list_group.append(_group)
  410. elif defind_count==0:
  411. list_group.append(_group)
  412. return json.dumps(list_group)
  413. # def terminate(self, buffer):
  414. #
  415. #
  416. # list_docs = buffer[0]
  417. # if len(list_docs)>0:
  418. # defind_count = list_docs[0]["defind_count"]
  419. #
  420. # list_time_group = split_with_time(list_docs,"page_time_stamp",86400*2)
  421. #
  422. # list_group = []
  423. # for time_group in list_time_group:
  424. # _group = []
  425. # _set_column = set()
  426. # base_tenderee = ""
  427. # _set_tenderee = set()
  428. # for j in range(len(time_group)):
  429. # if time_group[j]["tenderee"] is not None and time_group[j]["tenderee"]!="":
  430. # # if base_tenderee =="":
  431. # # base_tenderee = time_group[j]["tenderee"]
  432. # # _set_tenderee.add(time_group[j]["tenderee"])
  433. # # simi = getSimilarityOfString(base_tenderee,time_group[j]["tenderee"])
  434. # # if simi<0.8:
  435. # # _set_tenderee.add(time_group[j]["tenderee"])
  436. #
  437. # _set_tenderee.add(time_group[j]["tenderee"])
  438. # _set_column.add(time_group[j]["defind_column"])
  439. # _group.append({"docid":time_group[j]["docid"],"extract_count":time_group[j]["extract_count"]})
  440. #
  441. # if len(_group)>=3 and len(_set_tenderee)>1:
  442. # pass
  443. # else:
  444. # if len(_group)>1:
  445. # if defind_count==2:
  446. # if len(_set_column)>=2:
  447. # list_group.append(_group)
  448. # elif defind_count==1:
  449. # if len(_set_column)==1:
  450. # list_group.append(_group)
  451. # elif defind_count==0:
  452. # list_group.append(_group)
  453. #
  454. # return json.dumps(list_group)
  455. def isEmpty(_str):
  456. if _str is None or _str=="":
  457. return True
  458. return False
  459. @annotate('bigint->string')
  460. class f_group_fingerprint(BaseUDAF):
  461. def __init__(self):
  462. import json
  463. global json
  464. def new_buffer(self):
  465. return [[]]
  466. def iterate(self, buffer,docid):
  467. buffer[0].append(docid)
  468. def merge(self, buffer, pbuffer):
  469. buffer[0].extend(pbuffer[0])
  470. def terminate(self, buffer):
  471. list_docid = buffer[0]
  472. list_docid.sort(key=lambda x:x)
  473. return ",".join([str(a) for a in list_docid])
  474. @annotate('string->bigint,string')
  475. class f_ungroup_fingerprint(BaseUDTF):
  476. def process(self,dumplicates):
  477. list_docid = dumplicates.split(",")
  478. self.forward(int(list_docid[0]),",".join(list_docid[1:]))
  479. @annotate('bigint,bigint,string->string')
  480. class f_dump_probability(BaseUDAF):
  481. '''
  482. 合并组为一条记录
  483. '''
  484. def __init__(self):
  485. import json
  486. global json
  487. def new_buffer(self):
  488. return [[]]
  489. def iterate(self, buffer,docid,page_time_stamp,_type):
  490. buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})
  491. def merge(self, buffer, pbuffer):
  492. buffer[0].extend(pbuffer[0])
  493. def terminate(self, buffer):
  494. list_dict = buffer[0]
  495. list_dict = list_dict[:10000]
  496. list_group = split_with_time(list_dict,sort_key="page_time_stamp",timedelta=86400*2)
  497. return json.dumps(list_group)
  498. @annotate('string -> bigint,bigint,bigint,bigint,string')
  499. class f_split_dumplicate_probability(BaseUDTF):
  500. def __init__(self):
  501. import logging
  502. import json
  503. global logging,json
  504. logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  505. def process(self,list_group_str):
  506. logging.info("0")
  507. logging.info(list_group_str)
  508. if list_group_str is not None:
  509. logging.info("1")
  510. try:
  511. list_group = json.loads(list_group_str)
  512. logging.info("2")
  513. for _group in list_group:
  514. if len(_group)>0:
  515. _type = _group[0].get("type","")
  516. logging.info("3%d"%len(list_group))
  517. # _group.sort(key=lambda x:x["page_time_stamp"])
  518. _len = min(100,len(_group))
  519. for _index_i in range(_len):
  520. _count = 0
  521. for _index_j in range(_index_i+1,_len):
  522. if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
  523. break
  524. _count += 1
  525. _docid1 = _group[_index_i]["docid"]
  526. _docid2 = _group[_index_j]["docid"]
  527. if _docid1<_docid2:
  528. self.forward(_docid1,_docid2,1,_len,_type)
  529. else:
  530. self.forward(_docid2,_docid1,1,_len,_type)
  531. except Exception as e:
  532. logging(str(e))
  533. @annotate('bigint,bigint,string->string')
  534. class f_dumplicate_groupPairs(BaseUDAF):
  535. '''
  536. 合并组为一条记录
  537. '''
  538. def __init__(self):
  539. import json
  540. global json
  541. def new_buffer(self):
  542. return [[]]
  543. def iterate(self, buffer,is_exists,counts,_type):
  544. buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})
  545. def merge(self, buffer, pbuffer):
  546. buffer[0].extend(pbuffer[0])
  547. def terminate(self, buffer):
  548. list_dict = buffer[0]
  549. list_dict = list_dict[:10000]
  550. return json.dumps(list_dict)
  551. def check_columns(tenderee_less,tenderee_greater,
  552. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  553. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  554. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
  555. flag = True
  556. _set_tenderee = set()
  557. if tenderee_less is not None and tenderee_less!="":
  558. _set_tenderee.add(tenderee_less)
  559. if tenderee_greater is not None and tenderee_greater!="":
  560. _set_tenderee.add(tenderee_greater)
  561. if len(_set_tenderee)>1:
  562. return False
  563. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  564. if code_sim>0.6 and code_sim<1:
  565. return False
  566. #同批次不同编号
  567. if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
  568. _split_code_less = project_code_less.split("-")
  569. _split_code_greater = project_code_greater.split("-")
  570. if len(_split_code_less)>1 and len(_split_code_greater)>1:
  571. if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
  572. return False
  573. _set_win_tenderer = set()
  574. if win_tenderer_less is not None and win_tenderer_less!="":
  575. _set_win_tenderer.add(win_tenderer_less)
  576. if win_tenderer_greater is not None and win_tenderer_greater!="":
  577. _set_win_tenderer.add(win_tenderer_greater)
  578. if len(_set_win_tenderer)>1:
  579. return False
  580. _set_win_bid_price = set()
  581. if win_bid_price_less is not None and win_bid_price_less!="":
  582. _set_win_bid_price.add(float(win_bid_price_less))
  583. if win_bid_price_greater is not None and win_bid_price_greater!="":
  584. _set_win_bid_price.add(float(win_bid_price_greater))
  585. if len(_set_win_bid_price)>1:
  586. return False
  587. _set_bidding_budget = set()
  588. if bidding_budget_less is not None and bidding_budget_less!="":
  589. _set_bidding_budget.add(float(bidding_budget_less))
  590. if bidding_budget_greater is not None and bidding_budget_greater!="":
  591. _set_bidding_budget.add(float(bidding_budget_greater))
  592. if len(_set_bidding_budget)>1:
  593. return False
  594. return True
  595. import math
  596. def featurnCount(_count,max_count=100):
  597. return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
  598. def getSimLevel(str1,str2):
  599. str1_null = False
  600. str2_null = False
  601. _v = 0
  602. if str1 is None or str1=="":
  603. str1_null = True
  604. if str2 is None or str2=="":
  605. str2_null = True
  606. if str1_null and str2_null:
  607. _v = 2
  608. elif str1_null and not str2_null:
  609. _v = 4
  610. elif not str1_null and str2_null:
  611. _v = 6
  612. elif not str1_null and not str2_null:
  613. if str1==str2:
  614. _v = 10
  615. else:
  616. _v = 0
  617. return _v
  618. def getLength(_str):
  619. return len(_str if _str is not None else "")
  620. def check_money(bidding_budget_less,bidding_budget_greater,
  621. win_bid_price_less,win_bid_price_greater):
  622. #check saming
  623. budget_is_same = ""
  624. price_is_same = ""
  625. if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  626. budget_less = float(bidding_budget_less)
  627. budget_greater = float(bidding_budget_greater)
  628. if budget_less!=budget_greater:
  629. if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
  630. budget_is_same = True
  631. if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
  632. budget_is_same = True
  633. if budget_is_same=="":
  634. return False
  635. if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  636. price_less = float(win_bid_price_less)
  637. price_greater = float(win_bid_price_greater)
  638. if price_less!=price_greater:
  639. if max(price_less,price_greater)/min(price_less,price_greater)==10000:
  640. price_is_same = True
  641. if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
  642. price_is_same = True
  643. if price_is_same=="":
  644. return False
  645. return True
  646. def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  647. tenderee_less,tenderee_greater,
  648. agency_less,agency_greater,
  649. win_tenderer_less,win_tenderer_greater,
  650. similarity=0.85):
  651. def get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,entity_less,entity_greater,similarity):
  652. if getLength(entity_less)>0 and getLength(entity_greater)>0:
  653. if entity_less!=entity_greater:
  654. is_same = ''
  655. _sim = getSimilarityOfString(entity_less,entity_greater)
  656. if _sim>similarity:
  657. is_same = True
  658. if is_same=='':
  659. if str(nlp_enterprise_less).find(entity_greater)>0 or str(nlp_enterprise_greater).find(entity_less)>0:
  660. is_same = True
  661. if is_same=='':
  662. return False
  663. return True
  664. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,similarity):
  665. return False
  666. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,agency_less,agency_greater,similarity):
  667. return False
  668. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,win_tenderer_less,win_tenderer_greater,similarity):
  669. return False
  670. return True
  671. def check_codes(project_codes_less,project_codes_greater):
  672. #check the similarity
  673. is_same = False
  674. is_sim = False
  675. for project_code_less in project_codes_less:
  676. for project_code_greater in project_codes_greater:
  677. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  678. if code_sim>0.6 and code_sim<1:
  679. is_sim = True
  680. if code_sim==1:
  681. is_same = True
  682. if is_same:
  683. return True
  684. if is_sim:
  685. return False
  686. return True
  687. def check_demand():
  688. return True
  689. package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
  690. code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.]+")
  691. num_pattern = re.compile("^\d+(?:\.\d+)?$")
  692. num1_pattern = re.compile("[一二三四五六七八九]+")
  693. location_pattern = re.compile(".{1,2}[市区镇县村路]")
  694. building_pattern = "工程招标代理|工程设计|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
  695. date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
  696. def check_doctitle(doctitle_refind_less,doctitle_refind_greater,codes_less=[],code_greater=[]):
  697. for _c in codes_less:
  698. doctitle_refind_less = str(doctitle_refind_less).replace(_c,"")
  699. for _c in code_greater:
  700. doctitle_refind_greater = str(doctitle_refind_greater).replace(_c,"")
  701. doctitle_refind_less = re.sub(date_pattern,"",doctitle_refind_less)
  702. doctitle_refind_greater = re.sub(date_pattern,"",doctitle_refind_greater)
  703. #check the package
  704. if doctitle_refind_less is None:
  705. doctitle_refind_less = ""
  706. if doctitle_refind_greater is None:
  707. doctitle_refind_greater = ""
  708. _pack1 = None
  709. _pack2 = None
  710. #if contain then pass
  711. if doctitle_refind_less.find(doctitle_refind_greater)>=0 or doctitle_refind_greater.find(doctitle_refind_less)>=0:
  712. return True
  713. #check the package in title
  714. _match = re.search(package_number_pattern,doctitle_refind_less)
  715. if _match is not None:
  716. _pack1 = _match.groupdict()["name"]
  717. _match = re.search(package_number_pattern,doctitle_refind_greater)
  718. if _match is not None:
  719. _pack2 = _match.groupdict()["name"]
  720. if _pack1 is not None and _pack2 is not None:
  721. print(_pack1,_pack2)
  722. if _pack1!=_pack2:
  723. return False
  724. #check the nums in title
  725. doctitle_refind_less = re.sub(package_number_pattern,"",doctitle_refind_less)
  726. doctitle_refind_greater = re.sub(package_number_pattern,"",doctitle_refind_greater)
  727. #check the nums,location,building in title
  728. for _p in [code_pattern]:
  729. num_all_l = re.findall(_p,doctitle_refind_less)
  730. num_all_g = re.findall(_p,doctitle_refind_greater)
  731. set_num_l = set()
  732. set_num_g = set()
  733. for _l in num_all_l:
  734. if re.search(num_pattern,_l) is not None:
  735. if _l.find(".")>0:
  736. set_num_l.add(_l)
  737. elif len(_l)<4:
  738. set_num_l.add(_l)
  739. for _g in num_all_g:
  740. if re.search(num_pattern,_g) is not None:
  741. if _g.find(".")>0:
  742. set_num_g.add(_g)
  743. elif len(_g)<4:
  744. set_num_g.add(_g)
  745. if len(set_num_l)>0 and len(set_num_g)>0:
  746. if len(set_num_l&set_num_g)!=len(set_num_l):
  747. return False
  748. #check location and keywords
  749. for _p in [num1_pattern,location_pattern,building_pattern]:
  750. num_all_l = re.findall(_p,doctitle_refind_less)
  751. num_all_g = re.findall(_p,doctitle_refind_greater)
  752. set_num_l = set(num_all_l)
  753. set_num_g = set(num_all_g)
  754. if len(set_num_l)==len(set_num_g):
  755. if len(set_num_l&set_num_g)!=len(set_num_l):
  756. return False
  757. return True
  758. def check_product(product_less,product_greater,split_char=","):
  759. if getLength(product_less)>0 and getLength(product_greater)>0:
  760. _product_l = product_less.split(split_char)
  761. _product_g = product_greater.split(split_char)
  762. for _l in _product_l:
  763. for _g in _product_g:
  764. if getSimilarityOfString(_l,_g)>=0.8:
  765. return True
  766. return False
  767. return True
  768. def check_package(package_less,package_greater,split_char=","):
  769. if getLength(package_less)>0 and getLength(package_greater)>0:
  770. _product_l = package_less.split(split_char)
  771. _product_g = package_greater.split(split_char)
  772. for _l in _product_l:
  773. for _g in _product_g:
  774. if _l==_g:
  775. return True
  776. return False
  777. return True
  778. def check_time(json_time_less,json_time_greater):
  779. if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
  780. time_less = json.loads(json_time_less)
  781. time_greater = json.loads(json_time_greater)
  782. for k,v in time_less.items():
  783. if getLength(v)>0:
  784. v1 = time_greater.get(k,"")
  785. if getLength(v1)>0:
  786. if v!=v1:
  787. return False
  788. return True
@annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
class f_dumplicate_featureMatrix(BaseUDTF):
    """UDTF: run the pairwise duplicate checks on two candidate documents and
    forward (debug_label, probability); a probability of 0 means some hard
    check rejected the pair."""
    def __init__(self):
        import logging
        import json
        global logging,json

    def process(self,json_context,docchannel_less,docchannel_greater,page_time_less,page_time_greater,nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,
                agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
                win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
                bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater,product_less,product_greater):
        #check the page_time by special docchannel
        # For these channels, a different title combined with a different
        # publication date rules the pair out immediately.
        if docchannel_less in (51,102,103,104,115,116,117):
            if doctitle_refine_less!=doctitle_refine_greater:
                if page_time_less!=page_time_greater:
                    self.forward("[1-%s]"%(str(docchannel_less)),0)
                    return
        if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
            self.forward("[2-%s]"%(str(doctitle_refine_less)+"=="+str(doctitle_refine_greater)),0)
            return
        # if not check_codes([project_code_less],[project_code_greater]):
        #     self.forward("[3-%s]"%(str(project_code_less)+"=="+str(project_code_greater)),0)
        #     return
        if not check_demand():
            self.forward("[4-]",0)
            return
        if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                            tenderee_less,tenderee_greater,
                            agency_less,agency_greater,
                            win_tenderer_less,win_tenderer_greater):
            _error = ""
            for a in [nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater]:
                _error += str(a)
            self.forward("[5-%s]"%_error,0)
            return
        if not check_money(bidding_budget_less,bidding_budget_greater,
                           win_bid_price_less,win_bid_price_greater):
            _error = ""
            for a in [bidding_budget_less,bidding_budget_greater,
                      win_bid_price_less,win_bid_price_greater]:
                _error += str(a)
            self.forward("[6-%s]"%_error,0)
            return
        if not check_product(product_less,product_greater):
            _error = "%s=%s"%(str(product_less),str(product_greater))
            self.forward("7-%s"%_error,0)
            return
        # All hard checks passed: derive a probability from the context
        # counts plus the number of exactly-matching key fields.
        _context = json.loads(json_context)
        min_counts = 100
        dict_context = {}
        for item in _context:
            if item["counts"]<min_counts:
                min_counts = item["counts"]
            dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
        context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
        list_matrix = []
        #get the featurn of the context into matrix
        # (the feature-matrix experiment below is kept, disabled, as found)
        # for index_i in range(len(context_key)):
        #     for index_j in range(index_i+1,len(context_key)):
        #         _key = "%s&%s"%(context_key[index_i],context_key[index_j])
        #         _v = featurnCount(dict_context.get(_key,[0,0])[1])
        #         list_matrix.append(_v)
        # context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
        # for index_i in range(len(context3_key)):
        #     for index_j in range(index_i+1,len(context3_key)):
        #         for index_k in range(index_j+1,len(context3_key)):
        #             _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
        #             _v = featurnCount(dict_context.get(_key,[0,0])[1])
        #             list_matrix.append(_v)
        # list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
        # list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
        # list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
        # list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
        # list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
        # list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
        # list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
        # list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
        json_matrix = json.dumps(list_matrix)
        # Count how many of the 8 key fields match exactly.
        same_count = 0
        all_count = 8
        if getSimilarityOfString(project_code_less,project_code_greater)==1:
            same_count += 1
        if getSimilarityOfString(tenderee_less,tenderee_greater)==1:
            same_count += 1
        if getSimilarityOfString(agency_less,agency_greater)==1:
            same_count += 1
        if getSimilarityOfString(win_tenderer_less,win_tenderer_greater)==1:
            same_count += 1
        if getSimilarityOfString(bidding_budget_less,bidding_budget_greater)==1:
            same_count += 1
        if getSimilarityOfString(win_bid_price_less,win_bid_price_greater)==1:
            same_count += 1
        if getSimilarityOfString(project_name_less,project_name_greater)==1:
            same_count += 1
        if getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater)==1:
            same_count += 1
        # Rarer contexts (lower min_counts) earn a higher base probability.
        base_prob = 0
        if min_counts<3:
            base_prob = 0.9
        elif min_counts<5:
            base_prob = 0.8
        elif min_counts<8:
            base_prob = 0.7
        else:
            base_prob = 0.6
        _prob = base_prob*same_count/all_count
        # The forwarded string is a debug summary, not the feature matrix.
        json_matrix = "[==%s]"%(str(base_prob)+"="+str(same_count)+"="+str(all_count)+str(product_less)+str(product_greater))
        self.forward(json_matrix,_prob)
        return
@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
class f_redump_probability_final_check(BaseUDAF):
    '''
    Re-check a merged duplicate group (translated from the original note):
    when the group has more than 5 members, doctitle/tenderee/win_tenderer/
    bidding_budget may each take only one value inside the group; with 5 or
    fewer members, tenderee/win_tenderer/bidding_budget may each take only
    one value.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def new_buffer(self):
        # single-slot buffer holding the list of document dicts
        return [list()]

    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_code,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
                          "project_code":project_code,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        # Walk the group in descending confidence order; _index advances while
        # each new document passes the pairwise checks against every document
        # already accepted.  The accepted prefix becomes the duplicate set.
        list_group = []
        the_group = buffer[0]
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        _index = 0
        if len(the_group)>0:
            _index = 1
        while _index<len(the_group):
            document_greater = the_group[_index]
            docchannel_greater = document_greater["docchannel"]
            page_time_greater = document_greater["page_time"]
            doctitle_refine_greater = document_greater["doctitle_refine"]
            project_code_greater = document_greater["project_code"]
            nlp_enterprise_greater = document_greater["nlp_enterprise"]
            tenderee_greater = document_greater["tenderee"]
            agency_greater = document_greater["agency"]
            win_tenderer_greater = document_greater["win_tenderer"]
            bidding_budget_greater = document_greater["bidding_budget"]
            win_bid_price_greater = document_greater["win_bid_price"]
            product_greater = document_greater["product"]
            package_greater = document_greater["package"]
            json_time_greater = document_greater["json_dicttime"]
            _less_index = 0
            while _less_index<_index:
                document_less = the_group[_less_index]
                docchannel_less = document_less["docchannel"]
                page_time_less = document_less["page_time"]
                doctitle_refine_less = document_less["doctitle_refine"]
                project_code_less = document_less["project_code"]
                nlp_enterprise_less = document_less["nlp_enterprise"]
                tenderee_less = document_less["tenderee"]
                agency_less = document_less["agency"]
                win_tenderer_less = document_less["win_tenderer"]
                bidding_budget_less = document_less["bidding_budget"]
                win_bid_price_less = document_less["win_bid_price"]
                product_less = document_less["product"]
                package_less = document_less["package"]
                json_time_less = document_less["json_dicttime"]
                # Each check records 0 (fail), 1 (inconclusive) or 2 (strong
                # match); "pass" flips to 0 on any failure.
                check_result = {"pass":1}
                if docchannel_less in (51,102,103,104,115,116,117):
                    if doctitle_refine_less!=doctitle_refine_greater:
                        if page_time_less!=page_time_greater:
                            check_result["docchannel"] = 0
                            check_result["pass"] = 0
                    else:
                        check_result["docchannel"] = 2
                if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
                    check_result["doctitle"] = 0
                    check_result["pass"] = 0
                    logging.info("check_doctitle_failed:%s==%s"%(str(doctitle_refine_less),str(doctitle_refine_greater)))
                else:
                    check_result["doctitle"] = 2
                if not check_codes([project_code_less],[project_code_greater]):
                    check_result["code"] = 0
                    check_result["pass"] = 0
                    logging.info("check_code_failed:%s==%s"%(str(project_code_less),str(project_code_greater)))
                else:
                    if getLength(project_code_less)>0 and getLength(project_code_greater)>0 and project_code_less==project_code_greater:
                        check_result["code"] = 2
                    else:
                        check_result["code"] = 1
                if not check_product(product_less,product_greater):
                    check_result["product"] = 0
                    check_result["pass"] = 0
                    logging.info("check_product_failed:%s==%s"%(str(product_less),str(product_greater)))
                else:
                    if getLength(product_less)>0 and getLength(product_greater)>0:
                        check_result["product"] = 2
                    else:
                        check_result["product"] = 1
                if not check_demand():
                    check_result["pass"] = 0
                if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                                    tenderee_less,tenderee_greater,
                                    agency_less,agency_greater,
                                    win_tenderer_less,win_tenderer_greater):
                    check_result["entity"] = 0
                    check_result["pass"] = 0
                    logging.info("check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
                else:
                    # Which field counts as a "strong" entity match depends on
                    # the announcement channel.
                    if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
                        check_result["entity"] = 2
                    elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
                        check_result["entity"] = 2
                    else:
                        check_result["entity"] = 1
                if not check_money(bidding_budget_less,bidding_budget_greater,
                                   win_bid_price_less,win_bid_price_greater):
                    logging.info("check_money_failed:%s==%s==%s==%s"%(str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
                    check_result["money"] = 0
                    check_result["pass"] = 0
                else:
                    if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
                        check_result["money"] = 2
                    elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
                        check_result["money"] = 2
                    else:
                        check_result["money"] = 1
                if not check_package(package_less,package_greater):
                    logging.info("check_package_failed:%s==%s"%(str(package_less),str(package_greater)))
                    check_result["package"] = 0
                    check_result["pass"] = 0
                else:
                    if getLength(package_less)>0 and getLength(package_greater)>0:
                        check_result["package"] = 2
                    else:
                        check_result["package"] = 1
                if not check_time(json_time_less,json_time_greater):
                    logging.info("check_time_failed:%s==%s"%(str(json_time_less),str(json_time_greater)))
                    check_result["time"] = 0
                    check_result["pass"] = 0
                else:
                    if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
                        check_result["time"] = 2
                    else:
                        check_result["time"] = 1
                if check_result.get("pass",0)==0:
                    logging.info(str(check_result))
                    # Hard failures on time or money always reject; otherwise a
                    # strong match on entity+code+doctitle+product rescues the pair.
                    if check_result.get("time",1)==0:
                        break
                    if check_result.get("money",1)==0:
                        break
                    if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2:
                        pass
                    else:
                        break
                _less_index += 1
            if _less_index!=_index:
                break
            _index += 1
        # Emit the accepted prefix as a comma-separated docid list, ordered by
        # extract_count (desc) then docid, with duplicates removed.
        dumplicates = ""
        if _index>1:
            logging.info("index/whole:%d/%d"%(_index,len(the_group)))
            final_group = the_group[:_index]
            final_group.sort(key=lambda x:x["docid"])
            final_group.sort(key=lambda x:x["extract_count"],reverse=True)
            _set = set()
            for _d in final_group:
                _docid = _d["docid"]
                if _docid in _set:
                    continue
                dumplicates += "%d,"%_docid
                _set.add(_docid)
            # drop the trailing comma
            dumplicates = dumplicates[:-1]
        return dumplicates
@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,string->string')
class f_set_docid_binaryChart(BaseUDAF):
    '''
    Pair "empty" documents (no code/amount/winner/agency info) with a
    matching non-empty document from a different web source within the same
    2-day time window.  (Original note, translated: project code / winning
    bidder, len(project code)>7, winning bidder not empty.)
    '''
    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,docid, page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,
                          "bidding_budget":bidding_budget,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price,
                          "agency":agency,"web_source_no":web_source_no})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        list_docs = buffer[0]
        # bucket documents into 2-day time windows first
        list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*2)
        list_group = []
        empty_key = ["project_code","bidding_budget","win_tenderer","win_bid_price","agency"]
        for _timeGroups in list_timeGroups:
            # split each window into "empty" docs (all key fields blank) and the rest
            list_empty = []
            list_notEmpty = []
            for _item in _timeGroups:
                empty_flag = True
                for _key in empty_key:
                    if not isEmpty(_item[_key]):
                        empty_flag = False
                        break
                if empty_flag:
                    list_empty.append(_item)
                else:
                    list_notEmpty.append(_item)
            for _e in list_empty:
                _group = [{"docid":_e["docid"],"extract_count":_e["extract_count"]}]
                _e_tenderee = _e["tenderee"]
                for _ne in list_notEmpty:
                    # lazily track which web sources a non-empty doc is already paired with
                    if "set_webSource" not in _ne:
                        _ne["set_webSource"] = set()
                        _ne["set_webSource"].add(_ne["web_source_no"])
                    _suit = False
                    # tenderee must match when the empty doc carries one
                    if not isEmpty(_e_tenderee) and _e_tenderee==_ne["tenderee"]:
                        _suit = True
                    elif isEmpty(_e_tenderee):
                        _suit = True
                    if _suit:
                        # pair only across different web sources, at most once per source
                        if _e["web_source_no"] not in _ne["set_webSource"]:
                            _ne["set_webSource"].add(_e["web_source_no"])
                            _group.append({"docid":_ne["docid"],"extract_count":_ne["extract_count"]})
                            break
                if len(_group)>1:
                    list_group.append(_group)
        return json.dumps(list_group)
  1116. def split_with_time(list_dict,sort_key,timedelta=86400*2):
  1117. if len(list_dict)>0:
  1118. if sort_key in list_dict[0]:
  1119. list_dict.sort(key=lambda x:x[sort_key])
  1120. list_group = []
  1121. _begin = 0
  1122. for i in range(len(list_dict)-1):
  1123. if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<timedelta:
  1124. continue
  1125. else:
  1126. _group = []
  1127. for j in range(_begin,i+1):
  1128. _group.append(list_dict[j])
  1129. if len(_group)>1:
  1130. list_group.append(_group)
  1131. _begin = i + 1
  1132. if len(list_dict)>1:
  1133. _group = []
  1134. for j in range(_begin,len(list_dict)):
  1135. _group.append(list_dict[j])
  1136. if len(_group)>1:
  1137. list_group.append(_group)
  1138. return list_group
  1139. return [list_dict]
@annotate('bigint,bigint,bigint,string,string,string,string,string->string')
class f_set_docid_limitNum_contain(BaseUDAF):
    '''
    Group documents within 2-day windows when all four limit columns are
    single-valued and every non-empty contain_column is a substring chain.
    (Original note, translated: project code / winning bidder with
    len(project code)>7, winning bidder not empty, fewer than 2 distinct
    non-empty tenderees after merge, equal non-empty amounts per notice type.)
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def new_buffer(self):
        return [list()]

    def iterate(self, buffer,docid,page_time_stamp,extract_count,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,"set_limit_column1":set_limit_column1,
                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                          "contain_column":contain_column})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        list_split = split_with_time(buffer[0],"page_time_stamp")
        list_group = []
        for _split in list_split:
            flag = True
            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
            # every limit column must be single-valued within the group
            for _key in keys:
                logging.info(_key+str(getSet(_split,_key)))
                if len(getSet(_split,_key))>1:
                    flag = False
                    break
            MAX_CONTAIN_COLUMN = None
            # check whether each notice in the group is contained by the longest one
            if flag:
                for _d in _split:
                    contain_column = _d["contain_column"]
                    if contain_column is not None and contain_column !="":
                        if MAX_CONTAIN_COLUMN is None:
                            MAX_CONTAIN_COLUMN = contain_column
                        else:
                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                # longer value must contain the current maximum
                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                    flag = False
                                    break
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                # shorter value must be contained in the maximum
                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                    flag = False
                                    break
            if flag:
                if len(_split)>1:
                    _group = []
                    for _item in _split:
                        _group.append({"docid":_item["docid"],"extract_count":_item["extract_count"]})
                    list_group.append(_group)
        return json.dumps(list_group)
@annotate('bigint->string')
class f_stamp_squence(BaseUDAF):
    '''
    Collect distinct page_time stamps and merge them into covering intervals
    [first-2d, last+2d], splitting wherever two consecutive stamps lie more
    than two days apart.
    '''
    def __init__(self):
        import json
        global json
        import logging
        global logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def new_buffer(self):
        return [set()]

    def iterate(self, buffer,page_time_stamp):
        buffer[0].add(page_time_stamp)

    def merge(self, buffer, pbuffer):
        buffer[0] |= pbuffer[0]

    def terminate(self, buffer):
        # 0 appears to mark a missing page_time; drop it before building intervals
        if 0 in buffer[0]:
            buffer[0].remove(0)
        list_stamp = list(buffer[0])
        list_stamp.sort(key=lambda x:x)
        list_stamp_final = []
        _begin = 0
        _time_decase = 86400*2
        logging.info(str(list_stamp))
        for _index in range(len(list_stamp)-1):
            if list_stamp[_index+1]-list_stamp[_index]<_time_decase:
                continue
            else:
                # 2-day (or larger) gap: close the current interval, padded by 2 days
                list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[_index]+_time_decase])
                _begin = _index+1
        if len(list_stamp)>0:
            # close the final open interval
            list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[-1]+_time_decase])
        return json.dumps(list_stamp_final)
  1229. @annotate("bigint,string->bigint")
  1230. class in_stamp(object):
  1231. def __init__(self):
  1232. import logging
  1233. import re
  1234. import json
  1235. global logging,re,json
  1236. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1237. def evaluate(self, page_time_stamp,json_stamp):
  1238. list_stamp = json.loads(json_stamp)
  1239. int_flag = 0
  1240. for item in list_stamp:
  1241. if page_time_stamp <item[0]:
  1242. break
  1243. if page_time_stamp>item[0] and page_time_stamp<item[1]:
  1244. int_flag = 1
  1245. break
  1246. return int_flag
  1247. def getConfidence(rule_id):
  1248. if rule_id ==0:
  1249. return 30
  1250. elif rule_id >=1 and rule_id <30:
  1251. return 20
  1252. else:
  1253. return 10
  1254. @annotate('string,string -> string')
  1255. class f_splitStr(BaseUDTF):
  1256. '''
  1257. 将多个组拆解成多条记录
  1258. '''
  1259. def __init__(self):
  1260. import logging
  1261. import json
  1262. global json,logging
  1263. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1264. def process(self, str_split,_split):
  1265. try:
  1266. for _s in str_split.split(_split):
  1267. self.forward(_s)
  1268. except Exception as e:
  1269. pass
@annotate('string,bigint -> bigint,bigint,bigint,bigint,bigint')
class f_split_group_single(BaseUDTF):
    '''
    Split duplicate groups into pairwise rows
    (docid1, docid2, extract_count1, extract_count2, confidence).
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self, json_set_docid,rule_id):
        list_group = json.loads(json_set_docid)
        for item in list_group:
            if len(item)>100:
                # Large group: avoid the O(n^2) product — sort by docid
                # descending and pair every member with the first (largest)
                # docid only (index_i stays 0).
                item.sort(key=lambda x:x["docid"],reverse=True)
                index_i = 0
                for index_j in range(1,len(item)):
                    if item[index_i]["docid"]!=item[index_j]["docid"]:
                        self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
            else:
                # Small group: emit the full Cartesian product of distinct docids.
                for index_i in range(len(item)):
                    for index_j in range(len(item)):
                        if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
                            self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  1294. @annotate('bigint,string->string')
  1295. class group_document(BaseUDAF):
  1296. '''
  1297. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1298. '''
  1299. def __init__(self):
  1300. import json
  1301. global json
  1302. def new_buffer(self):
  1303. return [[]]
  1304. def iterate(self, buffer,id,json_set_docid):
  1305. buffer[0].append({"id":id,"json_set_docid":json.loads(json_set_docid)})
  1306. def merge(self, buffer, pbuffer):
  1307. buffer[0].extend(pbuffer[0])
  1308. def terminate(self, buffer):
  1309. return json.dumps(buffer[0])
@annotate('bigint,string,bigint,string -> bigint,bigint,string')
class decare_document(BaseUDTF):
    '''
    Cartesian-join document groups and, for every pair of groups sharing at
    least one docid, forward the merged docid set.
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,group_id1, json_list_doc1,group_id2,json_list_doc2):
        # Only process one triangle of the cross join (y=x), cutting nearly
        # half of the pair volume.
        if group_id1>=group_id2:
            list_doc1 = json.loads(json_list_doc1)
            list_doc2 = json.loads(json_list_doc2)
            for _doc1 in list_doc1:
                for _doc2 in list_doc2:
                    # a duplicate group is never compared with itself
                    if _doc1["id"]!=_doc2["id"]:
                        # check whether the two groups share any docid
                        _set1 = set()
                        for _item1 in _doc1["json_set_docid"]:
                            _set1.add(_item1["docid"])
                        _set2 = set()
                        for _item2 in _doc2["json_set_docid"]:
                            _set2.add(_item2["docid"])
                        if len(_set1&_set2)>0:
                            # overlap found: emit group1's docids plus the
                            # members of group2 not already present
                            new_json_set_docid = _doc1["json_set_docid"]
                            for _item2 in _doc2["json_set_docid"]:
                                if _item2["docid"] not in _set1:
                                    new_json_set_docid.append(_item2)
                            self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
  1342. def getBestDocid(list_pair):
  1343. # [docid1,extract_count1,docid2,extract_count2]
  1344. # list_pair.sort(key=lambda x:x[3],reverse=True)
  1345. # _max_count = max(list_pair[0][3],list_pair[0][1])
  1346. # set_candidate = set()
  1347. # if list_pair[0][1]==_max_count:
  1348. # set_candidate.add(list_pair[0][0])
  1349. # for item in list_pair:
  1350. # if item[3]==_max_count:
  1351. # set_candidate.add(item[2])
  1352. # else:
  1353. # break
  1354. # list_candidate = list(set_candidate)
  1355. # list_candidate.sort(key=lambda x:x)
  1356. new_pair = []
  1357. new_pair.append([list_pair[0][0],list_pair[0][0],list_pair[0][1]])
  1358. for item in list_pair:
  1359. new_pair.append([item[0],item[2],item[3]])
  1360. new_pair.sort(key=lambda x:x[1])
  1361. new_pair.sort(key=lambda x:x[2],reverse=True)
  1362. return new_pair[0][1]
  1363. @annotate('bigint,bigint,bigint,bigint->string')
  1364. class choose_document(BaseUDAF):
  1365. '''
  1366. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1367. '''
  1368. def __init__(self):
  1369. import json
  1370. global json
  1371. def new_buffer(self):
  1372. return [[]]
  1373. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  1374. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  1375. def merge(self, buffer, pbuffer):
  1376. buffer[0].extend(pbuffer[0])
  1377. def terminate(self, buffer):
  1378. list_pair = buffer[0]
  1379. _set = set()
  1380. for item in buffer[0]:
  1381. _set.add(str(item[2]))
  1382. list_dumplicate = list(_set)
  1383. best_docid = getBestDocid(list_pair)
  1384. if best_docid==list_pair[0][0]:
  1385. save_flag = 1
  1386. else:
  1387. save_flag = 0
  1388. return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
  1389. @annotate('string -> bigint,string')
  1390. class f_get_choose_document(BaseUDTF):
  1391. '''
  1392. 将多个组拆解成多条记录
  1393. '''
  1394. def __init__(self):
  1395. import logging
  1396. import json
  1397. global json,logging
  1398. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1399. def process(self,json_choose):
  1400. if json_choose is None:
  1401. self.forward(1,None)
  1402. else:
  1403. _choose = json.loads(json_choose)
  1404. self.forward(_choose["save_flag"],",".join(_choose["dumplicates"]))
  1405. @annotate('bigint,bigint,bigint,bigint->string')
  1406. class group_document_bestFirst(BaseUDAF):
  1407. '''
  1408. 将组里面最优的放在前面
  1409. '''
  1410. def __init__(self):
  1411. import json
  1412. global json
  1413. def new_buffer(self):
  1414. return [[]]
  1415. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  1416. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  1417. def merge(self, buffer, pbuffer):
  1418. buffer[0].extend(pbuffer[0])
  1419. def terminate(self, buffer):
  1420. list_pair = buffer[0]
  1421. _set = set()
  1422. for item in buffer[0]:
  1423. _set.add(item[2])
  1424. _set.add(list_pair[0][0])
  1425. best_docid = getBestDocid(list_pair)
  1426. _set.remove(best_docid)
  1427. list_dumplicate = list(_set)
  1428. list_dumplicate.sort(key=lambda x:x)
  1429. list_dumplicate.insert(0,best_docid)
  1430. list_dumplicate_str = []
  1431. for item in list_dumplicate:
  1432. list_dumplicate_str.append(str(item))
  1433. return ",".join(list_dumplicate_str)
  1434. @annotate('string -> bigint,string')
  1435. class f_get_best_dumplicates(BaseUDTF):
  1436. '''
  1437. 得到每个分组中最优的那一条及其重复记录
  1438. '''
  1439. def __init__(self):
  1440. import logging
  1441. import json
  1442. global json,logging
  1443. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1444. def process(self,list_dumplicate_str):
  1445. if list_dumplicate_str is None or list_dumplicate_str=='':
  1446. pass
  1447. else:
  1448. list_dumplicate = list_dumplicate_str.split(",")
  1449. if len(list_dumplicate)>0:
  1450. self.forward(int(list_dumplicate[0]),",".join(list_dumplicate[1:]))
  1451. else:
  1452. pass
  1453. @annotate('bigint,bigint->string')
  1454. class bridge2group(BaseUDAF):
  1455. '''
  1456. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1457. '''
  1458. def __init__(self):
  1459. import json
  1460. global json
  1461. def new_buffer(self):
  1462. return [set()]
  1463. def iterate(self, buffer,docid1,docid2):
  1464. buffer[0].add(docid1)
  1465. buffer[0].add(docid2)
  1466. def merge(self, buffer, pbuffer):
  1467. buffer[0] |= pbuffer[0]
  1468. def terminate(self, buffer):
  1469. list_pair = list(buffer[0])
  1470. list_pair.sort(key=lambda x:x,reverse=True)
  1471. return json.dumps(list_pair)
  1472. @annotate('string -> bigint,bigint')
  1473. class group2bridge(BaseUDTF):
  1474. '''
  1475. 将多个组拆解成多条记录
  1476. '''
  1477. def __init__(self):
  1478. import logging
  1479. import json
  1480. global json,logging
  1481. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1482. def process(self,json_list_docid):
  1483. list_docid = json.loads(json_list_docid)
  1484. for _docid in list_docid:
  1485. self.forward(list_docid[-1],_docid)
  1486. @annotate('string->string')
  1487. class to_url(object):
  1488. def evaluate(self,_s):
  1489. if _s is None or _s=="":
  1490. return
  1491. else:
  1492. list_l = []
  1493. for l in _s.split(","):
  1494. list_l.append("http://www.bidizhaobiao.com/info-%s.html"%l)
  1495. return ",".join(list_l)
  1496. @annotate('bigint,bigint,string -> bigint')
  1497. class f_get_dump_docid(BaseUDTF):
  1498. '''
  1499. 将多个组拆解成多条记录
  1500. '''
  1501. def __init__(self):
  1502. import logging
  1503. import json
  1504. global json,logging
  1505. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1506. def process(self,docid,save_flag,dumplicates):
  1507. if save_flag==0:
  1508. self.forward(docid)
  1509. if dumplicates is not None:
  1510. list_docid = dumplicates.split(",")
  1511. if len(list_docid)>0:
  1512. for _docid in list_docid[1:]:
  1513. self.forward(int(_docid))
  1514. else:
  1515. if dumplicates is not None:
  1516. list_docid = dumplicates.split(",")
  1517. if len(list_docid)>0:
  1518. for _docid in list_docid:
  1519. self.forward(int(_docid))
  1520. @annotate('string -> bigint,bigint')
  1521. class f_get_docid(BaseUDTF):
  1522. '''
  1523. 将多个组拆解成多条记录
  1524. '''
  1525. def __init__(self):
  1526. import logging
  1527. import json
  1528. global json,logging
  1529. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1530. def process(self,json_set_docid):
  1531. team_id = 0
  1532. if json_set_docid is not None:
  1533. list_docses = json.loads(json_set_docid)
  1534. for list_docs in list_docses:
  1535. team_id += 1
  1536. for item in list_docs:
  1537. self.forward(team_id,item["docid"])
  1538. @annotate("string->bigint")
  1539. class get_count_dump(object):
  1540. def __init__(self):
  1541. import logging
  1542. import re
  1543. global logging,re
  1544. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1545. def evaluate(self, title):
  1546. _count = 0
  1547. if title is not None:
  1548. _count = len(title.split(","))
  1549. return _count
  1550. def getSet(list_dict,key):
  1551. _set = set()
  1552. for item in list_dict:
  1553. if key in item:
  1554. if item[key]!='' and item[key] is not None:
  1555. if re.search("^\d[\d\.]*$",item[key]) is not None:
  1556. _set.add(str(float(item[key])))
  1557. else:
  1558. _set.add(str(item[key]))
  1559. return _set
  1560. def getDiffIndex(list_dict,key,confidence=100):
  1561. '''
  1562. 优化为相似度判断
  1563. :param list_dict:
  1564. :param key:
  1565. :param confidence:
  1566. :return:
  1567. '''
  1568. # _set = set()
  1569. # for _i in range(len(list_dict)):
  1570. # item = list_dict[_i]
  1571. # if item["confidence"]>=confidence:
  1572. # continue
  1573. # if key in item:
  1574. # if item[key]!='' and item[key] is not None:
  1575. # if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  1576. # _set.add(str(float(item[key])))
  1577. # else:
  1578. # _set.add(str(item[key]))
  1579. # if len(_set)>1:
  1580. # return _i
  1581. # ==============================
  1582. _set = set()
  1583. _set_m = set()
  1584. base_s = ""
  1585. for _i in range(len(list_dict)):
  1586. item = list_dict[_i]
  1587. if item["confidence"]>=confidence:
  1588. continue
  1589. if key in item:
  1590. if item[key]!='' and item[key] is not None:
  1591. if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  1592. _m = float(item[key])
  1593. if _m>100000:
  1594. _m = _m//10000*10000
  1595. _set_m.add(str(_m))
  1596. else:
  1597. _s = str(item[key])
  1598. if base_s=="":
  1599. base_s = _s
  1600. else:
  1601. simi = getSimilarityOfString(base_s,_s)
  1602. if simi<0.8:
  1603. return _i
  1604. if len(_set_m)>1:
  1605. return _i
  1606. return len(list_dict)
  1607. @annotate('bigint,string -> bigint,bigint')
  1608. class f_getGroup_dumpFinal(BaseUDTF):
  1609. '''
  1610. 从最后的结果中获取组
  1611. '''
  1612. def __init__(self):
  1613. import logging
  1614. import json
  1615. global json,logging
  1616. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1617. def process(self,docid,dumplicates):
  1618. self.forward(int(docid),int(docid))
  1619. if dumplicates is not None:
  1620. list_docids = dumplicates.split(",")
  1621. for _docid in list_docids:
  1622. self.forward(int(docid),int(_docid))
  1623. @annotate('bigint,bigint,string,string,string,string,bigint,bigint,bigint->string')
  1624. class f_redump_limit_num(BaseUDAF):
  1625. '''
  1626. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1627. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1628. '''
  1629. def __init__(self):
  1630. import logging
  1631. import json,re
  1632. global json,logging,re
  1633. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1634. def new_buffer(self):
  1635. return [list()]
  1636. def iterate(self, buffer,main_docid,docid,doctitle,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2,confidence):
  1637. buffer[0].append({"main_docid":main_docid,"docid":docid,"doctitle":doctitle,"set_limit_column2":set_limit_column2,
  1638. "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,
  1639. "extract_count2":extract_count2,"confidence":confidence})
  1640. def merge(self, buffer, pbuffer):
  1641. buffer[0].extend(pbuffer[0])
  1642. def terminate(self, buffer):
  1643. list_group = []
  1644. the_group = buffer[0]
  1645. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  1646. if len(the_group)>5:
  1647. keys = ["doctitle","set_limit_column2","set_limit_column3","set_limit_column4"]
  1648. else:
  1649. keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
  1650. final_group = []
  1651. #置信度
  1652. list_key_index = []
  1653. for _k in keys:
  1654. if _k=="doctitle":
  1655. list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
  1656. else:
  1657. list_key_index.append(getDiffIndex(the_group,_k))
  1658. _index = min(list_key_index)
  1659. if _index>1:
  1660. main_docid = the_group[0]["main_docid"]
  1661. for item in the_group[:_index]:
  1662. if item["docid"]!=main_docid:
  1663. final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"],"confidence":item["confidence"]})
  1664. # stay = True
  1665. # for _key in keys:
  1666. # if len(getSet(the_group,_key))>1:
  1667. # stay = False
  1668. # break
  1669. #
  1670. # if stay:
  1671. # main_docid = the_group[0]["main_docid"]
  1672. # for item in the_group:
  1673. # if item["docid"]!=main_docid:
  1674. # final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
  1675. return json.dumps(final_group)
  1676. @annotate('string -> bigint,bigint,bigint,bigint,bigint')
  1677. class f_get_dumpFinal_checked(BaseUDTF):
  1678. '''
  1679. 从最后的结果中获取组
  1680. '''
  1681. def __init__(self):
  1682. import logging
  1683. import json
  1684. global json,logging
  1685. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1686. def process(self,list_group):
  1687. if list_group is not None:
  1688. final_group = json.loads(list_group)
  1689. for _group in final_group:
  1690. self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"],_group["confidence"])
  1691. @annotate('string -> bigint')
  1692. class f_getDumplicateDocids(BaseUDTF):
  1693. '''
  1694. 从最后的结果中获取组
  1695. '''
  1696. def __init__(self):
  1697. import logging
  1698. import json
  1699. global json,logging
  1700. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1701. def process(self,dumplicates):
  1702. list_docids = dumplicates.split(",")
  1703. for _d in list_docids:
  1704. self.forward(int(_d))
  1705. def getSimilarityOfString(str1,str2):
  1706. _set1 = set()
  1707. _set2 = set()
  1708. if str1 is not None:
  1709. for i in range(1,len(str1)):
  1710. _set1.add(str1[i-1:i+1])
  1711. for i in range(2,len(str1)):
  1712. _set1.add(str1[i-2:i+1])
  1713. if str2 is not None:
  1714. for i in range(1,len(str2)):
  1715. _set2.add(str2[i-1:i+1])
  1716. for i in range(2,len(str2)):
  1717. _set2.add(str2[i-2:i+1])
  1718. _len = max(1,min(len(_set1),len(_set2)))
  1719. return len(_set1&_set2)/_len
  1720. @annotate("string,string,string,string,string,string,string,string,string,string->bigint")
  1721. class f_is_legal(object):
  1722. def __init__(self):
  1723. import logging
  1724. import re
  1725. global logging,re
  1726. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1727. def evaluate(self, tenderee1,tenderee2,bidding_budget1,budding_budget2,win_tenderee1,win_tenderee2,win_bid_price1,win_bid_price2,project_code1,project_code2):
  1728. if tenderee1 is not None and tenderee1!="" and tenderee2 is not None and tenderee2!="" and tenderee1!=tenderee2:
  1729. return 0
  1730. if bidding_budget1 is not None and bidding_budget1!="" and budding_budget2 is not None and budding_budget2!="" and bidding_budget1!=budding_budget2:
  1731. return 0
  1732. if win_tenderee1 is not None and win_tenderee1!="" and win_tenderee2 is not None and win_tenderee2!="" and win_tenderee1!=win_tenderee2:
  1733. return 0
  1734. if win_bid_price1 is not None and win_bid_price1!="" and win_bid_price2 is not None and win_bid_price2!="" and win_bid_price1!=win_bid_price2:
  1735. return 0
  1736. _sim = getSimilarityOfString(project_code1,project_code2)
  1737. if _sim>0.7 and _sim<1:
  1738. return 0
  1739. return 1
  1740. @annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
  1741. class f_autorule_group(BaseUDAF):
  1742. '''
  1743. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1744. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1745. '''
  1746. def __init__(self):
  1747. import logging
  1748. import json,re
  1749. global json,logging,re
  1750. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1751. def new_buffer(self):
  1752. return [list()]
  1753. def iterate(self, buffer,main_docid,docid,docchannel,doctitle,doctitle_refine,area,province,city,district,web_source_no,fingerprint,
  1754. project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count1,extract_count2,confidence):
  1755. buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,
  1756. "doctitle_refine":doctitle_refine,"area":area,"province":province,
  1757. "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
  1758. "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
  1759. "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
  1760. "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
  1761. def merge(self, buffer, pbuffer):
  1762. buffer[0].extend(pbuffer[0][:100])
  1763. buffer[0] = buffer[0][:100]
  1764. def getSameKeys(self,_dict1,_dict2):
  1765. list_keys = []
  1766. for k,v in _dict1.items():
  1767. if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:
  1768. continue
  1769. v2 = _dict2.get(k,"")
  1770. if v is not None and v!="" and v2 is not None and v2!="" and v==v2:
  1771. list_keys.append(k)
  1772. list_keys.sort(key=lambda x:x)
  1773. return "=".join(list_keys)
  1774. def terminate(self, buffer):
  1775. list_group = []
  1776. the_group = buffer[0]
  1777. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  1778. if len(the_group)>5:
  1779. keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
  1780. else:
  1781. keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
  1782. #置信度
  1783. list_key_index = []
  1784. for _k in keys:
  1785. if _k=="doctitle":
  1786. list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
  1787. else:
  1788. list_key_index.append(getDiffIndex(the_group,_k))
  1789. final_group = []
  1790. _index = min(list_key_index)
  1791. if _index>1:
  1792. for item in the_group[:_index]:
  1793. final_group.append(item)
  1794. list_rules = []
  1795. for i in range(len(final_group)):
  1796. for j in range(i+1,len(final_group)):
  1797. _dict1 = final_group[i]
  1798. _dict2 = final_group[j]
  1799. _rule = self.getSameKeys(_dict1,_dict2)
  1800. list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
  1801. return json.dumps(list_rules)
  1802. @annotate('string -> string,bigint,bigint')
  1803. class f_autorule_group_extract(BaseUDTF):
  1804. '''
  1805. 从最后的结果中获取组
  1806. '''
  1807. def __init__(self):
  1808. import logging
  1809. import json
  1810. global json,logging
  1811. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1812. def process(self,rules_json):
  1813. list_rules = json.loads(rules_json)
  1814. for _rule in list_rules:
  1815. self.forward(_rule[0],_rule[1],_rule[2])
if __name__ == '__main__':
    # Ad-hoc smoke test of the title-comparison helper check_doctitle.
    # NOTE(review): a long block of commented-out manual experiments used to
    # live here (getSimilarityOfString/check_code/check_money probes, regex
    # pattern trials, a sample extraction JSON and a hand-driven f_set_docid
    # run); removed as dead scratch code.
    print(check_doctitle("南京市秦淮新河沿线泰山公寓、天虹山庄、福润雅居南区小区环境综合整治","(雨花台区)秦淮新河沿线泰山公寓、天虹山庄、福润雅居南区小区环境综合整治勘察设计"))