getAttributes.py 321 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
7477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978497949804981498249834984498549864987498849894990499149924993499449954996499749984999500050015002500350045005500650075008500950105011501250135014501550165017501850195020502150225023502450255026502750285029503050315032503350345035503650375038503950405041504250435044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137513851395140514151425143514451455146514751485149515051515152515351545155515651575158515951605161516251635164516551665167516851695170517151725173517451755176517751785179518051815182518351845185518651875188518951905191519251935194519551965197519851995200520152025203520452055206520752085209521052115212521352145215521652175218521952205221522252235224522552265227522852295230523152325233523452355236523752385239524052415242524352445245524652475248524952505251525252535254525552565257
  1. # from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process,getDigitsDic,isValidDate
  2. from BiddingKG.dl.common.Utils import *
  3. from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
  4. from decimal import Decimal
  5. import re
  6. import copy
  7. import math
  8. import pandas as pd
  9. import os
  10. from scipy.optimize import linear_sum_assignment
  11. from BiddingKG.dl.interface.Entitys import Match
  12. import numpy as np
  13. import uuid
  14. import time,calendar
  15. from datetime import datetime
  16. import json
  17. from BiddingKG.dl.entityLink.entityLink import is_enterprise_exist
  18. def getTheRole(entity,role_list):
  19. '''
  20. @summary:根据实体名称拿到index
  21. @param:
  22. entity:实体名称
  23. role_list:角色list
  24. @return:该实体所在下标
  25. '''
  26. for role_index in range(len(role_list)):
  27. if entity in role_list[role_index]:
  28. return role_index
  29. return None
  30. dict_role_id = {"0":"tenderee",
  31. "1":"agency",
  32. "2":"win_tenderer",
  33. "3":"second_tenderer",
  34. "4":"third_tenderer"}
  35. role2id_dict = {"tenderee":0,
  36. "agency":1,
  37. "win_tenderer":2,
  38. "second_tenderer":3,
  39. "third_tenderer":4}
  40. def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
  41. '''
  42. @param:
  43. packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合
  44. sentence_index:实体所在的句子
  45. begin_index:实体所在句子的起始位置
  46. @return:公司实体所属的包
  47. @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None
  48. '''
  49. '''
  50. if len(packageList)==0:
  51. return None
  52. before_index = None
  53. after_index = None
  54. equal_index = None
  55. equal_count = 0
  56. for pack_index in range(len(packageList)):
  57. if packageList[pack_index][1]>sentence_index and after_index is None:
  58. after_index = pack_index
  59. if packageList[pack_index][1]<sentence_index:
  60. before_index = pack_index
  61. if packageList[pack_index][1]==sentence_index and equal_index is None:
  62. equal_index = pack_index
  63. #当前句子和之前句子未找到包
  64. if before_index is None and equal_index is None:
  65. return None
  66. else:
  67. if after_index is None:
  68. end_index = len(packageList)
  69. else:
  70. end_index = after_index
  71. #只在当前句子找到一个包号
  72. if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
  73. return packageList[end_index-1][0]
  74. else:
  75. for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
  76. if packageList[i][2]>int(begin_index):
  77. if packageList[i-1][4]:
  78. return packageList[i-1][0]
  79. else:
  80. if packageList[i][4]:
  81. return packageList[i-1][0]
  82. else:
  83. return packageList[i][0]
  84. return packageList[end_index-1][0]
  85. '''
  86. if len(packageList)==0:
  87. return None,False
  88. list_legalPack = []
  89. for pack_index in range(len(packageList)):
  90. if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)):
  91. continue
  92. if DIRECT=="R" and (packageList[pack_index]["sentence_index"]<sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetwords_begin"]<begin_index)):
  93. continue
  94. if (packageList[pack_index]["scope"][0][0]<sentence_index or (packageList[pack_index]["scope"][0][0]==sentence_index and packageList[pack_index]["scope"][0][1]<=begin_index)) and (packageList[pack_index]["scope"][1][0]>sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)):
  95. if MAX_DIS is not None:
  96. if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS:
  97. list_legalPack.append(pack_index)
  98. else:
  99. list_legalPack.append(pack_index)
  100. # if (packageList[pack_index]["scope"][0][0] < sentence_index
  101. # or (packageList[pack_index]["scope"][0][0] == sentence_index
  102. # and packageList[pack_index]["scope"][0][1] <= begin_index))
  103. # and (packageList[pack_index]["scope"][1][0] > sentence_index
  104. # or (packageList[pack_index]["scope"][1][0] == sentence_index
  105. # and packageList[pack_index]["scope"][1][1] >= begin_index)):
  106. # pass
  107. _flag = True
  108. for _index in list_legalPack:
  109. if roleid in packageList[_index]["hit"]:
  110. continue
  111. else:
  112. _flag = False
  113. packageList[_index]["hit"].add(roleid)
  114. return packageList[_index]["pointer"],_flag
  115. if len(list_legalPack)>0:
  116. return packageList[0]["pointer"],_flag
  117. return None,False
  118. # Generate the legal combinations
  # get_legal_comba(list_entity, dict_role_combination)
  #   Builds the list of candidate role assignments across all packages.
  #   dict_role_combination is indexed as dict_role_combination[packageName][role]
  #   and each value is iterated as entity texts; "" entries are skipped.
  #   list_entity is only forwarded to get_dict_entity_prob().
  #   Steps:
  #     1. circle_package() enumerates the role->entity dicts of each package.
  #     2. Unless a package has more than 1000 solutions, solutions that are
  #        strict subsets of another solution are dropped.
  #     3. If the product of the per-package solution counts exceeds 250, only
  #        the solution with the largest getSumExpectation() is kept per package.
  #     4. circle_pageages() merges one solution per package into dicts keyed
  #        "<package>$$<role>".
  #     5. A merged dict in which an entity appears both under the "Project"
  #        package and under another package is split into two dicts, one
  #        keeping the Project assignment and one keeping the other assignment.
  #   Returns the resulting list of dicts.
  #   NOTE(review): recursive_package/recursive_packages are only referenced
  #   from themselves and from commented-out calls below; `_find_count` is
  #   incremented but never read, and the `_new_dict` built in the 2021/5/25
  #   branch looks unused - confirm before cleaning up.
  119. def get_legal_comba(list_entity,dict_role_combination):
  120. # Collect all legal combinations within a single package
  121. def circle_package(_dict_legal_combination):
  122. list_dict_role_first = []
  123. for _role in _dict_legal_combination:
  124. if len(list_dict_role_first)==0:
  125. for _entity in _dict_legal_combination[_role]:
  126. if _entity !="":
  127. list_dict_role_first.append({_role:_entity})
  128. else:
  129. list_dict_role_after = []
  130. _find_count = 0
  131. for _entity in _dict_legal_combination[_role]:
  132. if _entity !="":
  133. for _dict in list_dict_role_first:
  134. _flag = True
  135. for _key1 in _dict:
  136. if _entity==_dict[_key1]:
  137. # Changed so that the tenderee and the agency (roles "0" and "1") may be the same entity
  138. if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
  139. _flag = True
  140. else:
  141. _flag = False
  142. if _flag:
  143. _find_count += 1
  144. _new_dict = copy.copy(_dict)
  145. _new_dict[_role] = _entity
  146. if len(list_dict_role_after)>100000:
  147. break
  148. list_dict_role_after.append(_new_dict)
  149. else:
  150. # 2021/5/25 update: the same entity (entity_text) with different roles
  151. if len(list_dict_role_after) > 100000:
  152. break
  153. for _dict in list_dict_role_first:
  154. for _key1 in _dict:
  155. if _entity == _dict[_key1]:
  156. _new_dict = copy.copy(_dict)
  157. _new_dict.pop(_key1)
  158. _new_dict[_role] = _entity
  159. list_dict_role_after.append({_role:_entity})
  160. if len(list_dict_role_after)==0:
  161. pass
  162. else:
  163. list_dict_role_first.extend(list_dict_role_after)
  164. return list_dict_role_first
  # Recursive enumeration of role assignments for one package; its only call
  # site outside itself is the commented-out call further down.
  165. def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
  166. last_layer = False
  167. # Return an empty list when the combination is empty
  168. if len(_dict_legal_combination.keys())==0:
  169. return []
  170. # Flip the flag once the recursion reaches the last layer
  171. if len(_dict_legal_combination.keys())==1:
  172. last_layer = True
  173. # Take one role and iterate over its entities
  174. _key_role = list(_dict_legal_combination.keys())[0]
  175. for item in _dict_legal_combination[_key_role]:
  176. copy_dict_one_selution = copy.copy(dict_one_selution)
  177. copy_dict_legal_combination = {}
  178. copy_set_legal_entity = copy.copy(set_legal_entity)
  179. # Copy all remaining roles for the next round of recursion
  180. for _key in _dict_legal_combination.keys():
  181. if _key!=_key_role:
  182. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  183. # Changed so that the tenderee and the agency (roles "0" and "1") may be the same entity
  184. if item !="":
  185. _flag = True
  186. if str(_key_role) in ["0","1"]:
  187. for _key_flag in copy_dict_one_selution:
  188. if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
  189. _flag = False
  190. else:
  191. for _key_flag in copy_dict_one_selution:
  192. if copy_dict_one_selution[_key_flag]==item:
  193. _flag = False
  194. if _flag:
  195. copy_dict_one_selution[_key_role] = item
  196. '''
  197. if item not in copy_set_legal_entity:
  198. if item !="":
  199. copy_dict_one_selution[_key_role] = item
  200. '''
  201. copy_set_legal_entity.add(item)
  202. if last_layer:
  203. list_all_selution.append(copy_dict_one_selution)
  204. else:
  205. recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
  206. # Recursively combine the results of the individual packages
  207. def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
  208. last_layer = False
  209. if len(_dict_legal_combination.keys())==0:
  210. return []
  211. if len(_dict_legal_combination.keys())==1:
  212. last_layer = True
  213. _key_pack = list(_dict_legal_combination.keys())[0]
  214. for item in _dict_legal_combination[_key_pack]:
  215. copy_dict_one_selution = copy.copy(dict_one_selution)
  216. copy_dict_legal_combination = {}
  217. for _key in _dict_legal_combination.keys():
  218. if _key!=_key_pack:
  219. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  220. for _key_role in item.keys():
  221. copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
  222. if last_layer:
  223. list_all_selution.append(copy_dict_one_selution)
  224. else:
  225. recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
  226. return list_all_selution
  227. # Iteratively build every cross-package combination (keys become "<package>$$<role>")
  228. def circle_pageages(_dict_legal_combination):
  229. list_all_selution = []
  230. for _key_pack in _dict_legal_combination.keys():
  231. list_key_selution = []
  232. for item in _dict_legal_combination[_key_pack]:
  233. _dict = dict()
  234. for _key_role in item.keys():
  235. _dict[_key_pack+"$$"+_key_role] = item[_key_role]
  236. list_key_selution.append(_dict)
  237. if len(list_all_selution)==0:
  238. list_all_selution = list_key_selution
  239. else:
  240. _list_all_selution = []
  241. for item_1 in list_all_selution:
  242. for item_2 in list_key_selution:
  243. _list_all_selution.append(dict(item_1,**item_2))
  244. list_all_selution = _list_all_selution
  245. return list_all_selution
  246. # Collect the parsed result of each package
  247. _dict_legal_combination = {}
  248. for packageName in dict_role_combination.keys():
  249. _list_all_selution = []
  250. # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
  251. _list_all_selution = circle_package(dict_role_combination[packageName])
  252. '''
  253. # print("===1")
  254. # print(packageName)
  255. for item in _list_all_selution:
  256. # print(item)
  257. # print("===2")
  258. '''
  259. # Remove solutions that are contained in (a strict subset of) another solution
  260. list_all_selution_simple = []
  261. _list_set_all_selution = []
  262. for item_selution in _list_all_selution:
  263. item_set_selution = set()
  264. for _key in item_selution.keys():
  265. item_set_selution.add((_key,item_selution[_key]))
  266. _list_set_all_selution.append(item_set_selution)
  # More than 1000 solutions: skip the pairwise subset pruning and keep them all
  267. if len(_list_set_all_selution)>1000:
  268. _dict_legal_combination[packageName] = _list_all_selution
  269. continue
  270. for i in range(len(_list_set_all_selution)):
  271. be_included = False
  272. for j in range(len(_list_set_all_selution)):
  273. if i!=j:
  274. if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
  275. be_included = True
  276. if not be_included:
  277. list_all_selution_simple.append(_list_all_selution[i])
  278. _dict_legal_combination[packageName] = list_all_selution_simple
  279. _list_final_comba = []
  280. # Combine the per-package results; first count the size of the cartesian product
  281. _comba_count = 1
  282. for _key in _dict_legal_combination.keys():
  283. _comba_count *= len(_dict_legal_combination[_key])
  284. # If the product is too large, keep only the highest-probability solution per package
  285. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  286. if _comba_count>250:
  287. new_dict_legal_combination = dict()
  288. for _key_pack in _dict_legal_combination.keys():
  289. MAX_PROB = -1000
  290. _MAX_PROB_COMBA = None
  291. for item in _dict_legal_combination[_key_pack]:
  292. # print(_key_pack,item)
  293. _dict = dict()
  294. for _key in item.keys():
  295. _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
  296. _prob = getSumExpectation(dict_pack_entity_prob, _dict)
  297. if _prob>MAX_PROB:
  298. MAX_PROB = _prob
  299. _MAX_PROB_COMBA = [item]
  300. if _MAX_PROB_COMBA is not None:
  301. new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
  302. _dict_legal_combination = new_dict_legal_combination
  303. #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
  304. _list_final_comba = circle_pageages(_dict_legal_combination)
  305. # Apart from the Project package (tenderee and agency), the other packages do not conflict
  306. # Check whether an entity appears both in the Project package and in another package; if so, split the combination
  307. _list_real_comba = []
  308. for dict_item in _list_final_comba:
  309. set_project = set()
  310. set_other = set()
  311. for _key in list(dict_item.keys()):
  312. if _key.split("$$")[0]=="Project":
  313. set_project.add(dict_item[_key])
  314. else:
  315. set_other.add(dict_item[_key])
  316. set_common = set_project&set_other
  317. if len(set_common)>0:
  318. dict_project = {}
  319. dict_not_project = {}
  320. for _key in list(dict_item.keys()):
  321. if dict_item[_key] in set_common:
  322. if str(_key.split("$$")[0])=="Project":
  323. dict_project[_key] = dict_item[_key]
  324. else:
  325. dict_not_project[_key] = dict_item[_key]
  326. else:
  327. dict_project[_key] = dict_item[_key]
  328. dict_not_project[_key] = dict_item[_key]
  329. _list_real_comba.append(dict_project)
  330. _list_real_comba.append(dict_not_project)
  331. else:
  332. _list_real_comba.append(dict_item)
  333. return _list_real_comba
  334. def get_dict_entity_prob(list_entity,on_value=0.5):
  335. dict_pack_entity_prob = {}
  336. for in_attachment in [False,True]:
  337. identified_role = []
  338. if in_attachment==True:
  339. identified_role = [value[0] for value in dict_pack_entity_prob.values()]
  340. for entity in list_entity:
  341. if entity.entity_type in ['org','company'] and entity.in_attachment==in_attachment:
  342. values = entity.values
  343. role_prob = float(values[int(entity.label)])
  344. _key = entity.packageName+"$$"+str(entity.label)
  345. if role_prob>=on_value and str(entity.label)!="5":
  346. _key_prob = _key+"$text$"+entity.entity_text
  347. if in_attachment == True:
  348. role_prob = 0.8 if role_prob>0.8 else role_prob #附件的概率修改低点
  349. # if entity.entity_text in identified_role: # 2023/7/3 注释掉,选取概率最大的作为连接概率
  350. # continue
  351. if _key_prob in dict_pack_entity_prob:
  352. # new_prob = role_prob+dict_pack_entity_prob[_key_prob][1] if role_prob>0.9 else max(role_prob, dict_pack_entity_prob[_key_prob][1])
  353. # dict_pack_entity_prob[_key_prob] = [entity.entity_text, new_prob] #公司同角色多次出现概率累计
  354. if role_prob>dict_pack_entity_prob[_key_prob][1]:
  355. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  356. else:
  357. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  358. return dict_pack_entity_prob
  359. #计算合计期望
  360. def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
  361. '''
  362. expect = 0
  363. for entity in list_entity:
  364. if entity.entity_type in ['org','company']:
  365. values = entity.values
  366. role_prob = float(values[int(entity.label)])
  367. _key = entity.packageName+"$$"+str(entity.label)
  368. if role_prob>on_value and str(entity.label)!="5":
  369. if _key in combination.keys() and combination[_key]==entity.entity_text:
  370. expect += math.pow(role_prob,4)
  371. else:
  372. expect -= math.pow(role_prob,4)
  373. '''
  374. #修改为同一个实体只取对应包-角色的最大的概率值
  375. expect = 0
  376. dict_entity_prob = {}
  377. for _key_pack_entity in dict_pack_entity_prob:
  378. _key_pack = _key_pack_entity.split("$text$")[0]
  379. role_prob = dict_pack_entity_prob[_key_pack_entity][1]
  380. if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
  381. if _key_pack_entity in dict_entity_prob.keys():
  382. if dict_entity_prob[_key_pack_entity]<role_prob:
  383. dict_entity_prob[_key_pack_entity] = role_prob
  384. else:
  385. dict_entity_prob[_key_pack_entity] = role_prob
  386. else:
  387. if _key_pack_entity in dict_entity_prob.keys():
  388. if dict_entity_prob[_key_pack_entity]>-role_prob:
  389. dict_entity_prob[_key_pack_entity] = -role_prob
  390. else:
  391. dict_entity_prob[_key_pack_entity] = -role_prob
  392. # for entity in list_entity:
  393. # if entity.entity_type in ['org','company']:
  394. # values = entity.values
  395. # role_prob = float(values[int(entity.label)])
  396. # _key = entity.packageName+"$$"+str(entity.label)
  397. # if role_prob>=on_value and str(entity.label)!="5":
  398. # if _key in combination.keys() and combination[_key]==entity.entity_text:
  399. # _key_prob = _key+entity.entity_text
  400. # if _key_prob in dict_entity_prob.keys():
  401. # if dict_entity_prob[_key_prob]<role_prob:
  402. # dict_entity_prob[_key_prob] = role_prob
  403. # else:
  404. # dict_entity_prob[_key_prob] = role_prob
  405. # else:
  406. # _key_prob = _key+entity.entity_text
  407. # if _key_prob in dict_entity_prob.keys():
  408. # if dict_entity_prob[_key_prob]>-role_prob:
  409. # dict_entity_prob[_key_prob] = -role_prob
  410. # else:
  411. # dict_entity_prob[_key_prob] = -role_prob
  412. for _key in dict_entity_prob.keys():
  413. symbol = 1 if dict_entity_prob[_key]>0 else -1
  414. expect += symbol*math.pow(dict_entity_prob[_key],2)
  415. return expect
  416. def getRoleList(list_sentence,list_entity,on_value = 0.5):
  417. '''
  418. @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回
  419. @param:
  420. list_sentence:文章所有的sentence
  421. list_entity:文章所有的实体
  422. on_value:概率阈值
  423. @return:文章的角色list
  424. '''
  425. pack = getPackagesFromArticle(list_sentence,list_entity)
  426. if pack is None:
  427. return None
  428. # PackageList,PackageSet,dict_PackageCode = pack
  429. PackageList,PackageSet,dict_PackageCode,main_body_pack = pack
  430. #拿到所有可能的情况
  431. dict_role_combination = {}
  432. tenderee_or_agency_set = set() # 记录所有预测为招标或代理的实体集合
  433. win_tenderer_set = set() # 记录所有预测为中标的实体集合
  434. # print(PackageList)
  435. #拿到各个实体的packageName,packageCode
  436. main_contain_winner = False # 2024/10/11 判断正文是否包含中标人
  437. for entity in list_entity:
  438. if entity.entity_type in ['org','company'] and entity.label==2 and entity.values[entity.label]>0.7 and entity.in_attachment==False:
  439. main_contain_winner = True
  440. break
  441. for entity in list_entity:
  442. if entity.entity_type in ['org','company']:
  443. #限制附件里角色values[label]最大概率prob
  444. max_prob = 0.85
  445. if str(entity.label)!="5" and entity.in_attachment:
  446. if entity.values[entity.label]>max_prob:
  447. entity.values[entity.label] = max_prob
  448. #过滤掉字数小于3个的实体
  449. if len(entity.entity_text)<=3:
  450. continue
  451. values = entity.values
  452. role_prob = float(values[int(entity.label)])
  453. if role_prob>=on_value and str(entity.label)!="5":
  454. if main_contain_winner and entity.in_attachment and entity.label in [2,3,4]: # 2024/10/11 正文包含中标人,不再提取附件中标人 避免 例:504046747 附件角色OCR错字变两个标段
  455. continue
  456. if str(entity.label) in ["0","1"]:
  457. packageName = "Project"
  458. else:
  459. if len(PackageSet)>0:
  460. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"role-"+str(entity.label))
  461. if packagePointer is None:
  462. #continue
  463. packageName = "Project"
  464. # print(entity.entity_text, packageName,entity.sentence_index,entity.begin_index)
  465. else:
  466. #add pointer_pack
  467. entity.pointer_pack = packagePointer
  468. packageName = packagePointer.entity_text
  469. # print(entity.entity_text, packageName)
  470. else:
  471. packageName = "Project"
  472. find_flag = False
  473. if packageName in dict_PackageCode.keys():
  474. packageCode = dict_PackageCode[packageName]
  475. else:
  476. packageCode = ""
  477. entity.packageCode = packageCode
  478. role_name = dict_role_id.get(str(entity.label))
  479. entity.roleName = role_name
  480. entity.packageName = packageName
  481. if entity.packageName in dict_role_combination.keys():
  482. if str(entity.label) in dict_role_combination[entity.packageName].keys():
  483. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  484. else:
  485. dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
  486. else:
  487. dict_role_combination[entity.packageName] = {}
  488. #初始化空值
  489. roleIds = [0,1,2,3,4]
  490. for _roleId in roleIds:
  491. dict_role_combination[entity.packageName][str(_roleId)] = set([""])
  492. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  493. list_real_comba = get_legal_comba(list_entity,dict_role_combination)
  494. # print("===role_combination",dict_role_combination)
  495. # print("== real_comba",list_real_comba)
  496. #拿到最大期望值的组合
  497. max_index = 0
  498. max_expect = -100
  499. _index = 0
  500. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  501. for item_combination in list_real_comba:
  502. expect = getSumExpectation(dict_pack_entity_prob, item_combination)
  503. if expect>max_expect:
  504. max_index = _index
  505. max_expect = expect
  506. _index += 1
  507. RoleList = []
  508. RoleSet = set()
  509. if len(list_real_comba)>0:
  510. for _key in list_real_comba[max_index].keys():
  511. packageName = _key.split("$$")[0]
  512. label = _key.split("$$")[1]
  513. role_name = dict_role_id.get(str(label))
  514. entity_text = list_real_comba[max_index][_key]
  515. entity_prob = dict_pack_entity_prob.get(_key+'$text$'+entity_text, ['',0])[1]
  516. # entity_text = list_real_comba[max_index][_key][0]
  517. # entity_prob = list_real_comba[max_index][_key][1]
  518. if packageName in dict_PackageCode.keys():
  519. packagecode = dict_PackageCode.get(packageName)
  520. else:
  521. packagecode = ""
  522. RoleList.append(PREM(packageName,packagecode,role_name,entity_text,entity_prob,0,0.0,[]))
  523. if str(label) in ["0", "1"]:
  524. tenderee_or_agency_set.add(entity_text)
  525. elif str(label) in ["2"] and entity_prob > 0.8:
  526. win_tenderer_set.add(entity_text)
  527. # if len(list_real_comba) > 1 and label == '2': # 20240809 由于包号对应不上注销
  528. # multi_winner = []
  529. # for comba in list_real_comba:
  530. # tmp_ent = comba.get(_key, '')
  531. # tmp_prob = dict_pack_entity_prob.get(_key+'$text$'+tmp_ent, ['',0])[1]
  532. # if tmp_ent !='' and tmp_prob>0.8:
  533. # multi_winner.append(comba[_key])
  534. # if len(set(multi_winner)) > 1:
  535. # RoleList[-1].multi_winner = multi_winner
  536. # print('RoleList: ', RoleList)
  537. RoleSet.add(entity_text)
  538. #根据最优树来修正list_entity中角色对包的连接
  539. for _entity in list_entity:
  540. if _entity.pointer_pack is not None:
  541. _pack_name = _entity.pointer_pack.entity_text
  542. _find_flag = False
  543. for _prem in RoleList:
  544. if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
  545. _find_flag = True
  546. if not _find_flag:
  547. _entity.pointer_pack = None
  548. return RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set,main_body_pack
  549. def getPackageScopePattern():
  550. '''
  551. @summary: 获取包的作用域关键词
  552. '''
  553. df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
  554. pattern = "("
  555. for item in df["list_word"]:
  556. item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
  557. pattern += item+"|"
  558. pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}|##attachment##"
  559. return pattern
# Package-scope keyword pattern, built once when the module is imported
# (getPackageScopePattern reads end.xls from this module's directory).
pattern_packageScope = getPackageScopePattern()
  561. def getPackagesFromArticle(list_sentence, list_entity):
  562. '''
  563. @param:
  564. list_sentence:文章的句子list
  565. @summary: 将包的信息插入list_entity中
  566. @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
  567. '''
  568. if len(list_sentence) == 0:
  569. return None
  570. list_sentence.sort(key=lambda x: x.sentence_index)
  571. PackageList = []
  572. PackageList_scope = []
  573. PackageSet = set()
  574. dict_packageCode = dict()
  575. main_body_pack = set() # 2024/04/28 保存正文包号
  576. # package_number_pattern = re.compile(
  577. # '((施工|监理|监测|勘察|设计|劳务)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标[段包]?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]') # 标号
  578. # package_number_pattern = re.compile(
  579. # '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
  580. # |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
  581. # |(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
  582. # |((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
  583. # |[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
  584. # |((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{1,9})\
  585. # |[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
  586. other_package_pattern = re.compile(
  587. '((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
  588. win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整
  589. model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整
  590. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
  591. package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]{1,20})")
  592. # 纯数字类型的包号统一,例如:'01','1'
  593. re_digital = re.compile("^\d+$")
  594. def changeIndexFromWordToWords(tokens, word_index):
  595. '''
  596. @summary:转换某个字的字偏移为词偏移
  597. '''
  598. before_index = 0
  599. after_index = 0
  600. for i in range(len(tokens)):
  601. after_index = after_index + len(tokens[i])
  602. if before_index <= word_index and after_index >= word_index:
  603. return i
  604. before_index = after_index
  605. package_names = []
  606. def extractPackageCode(tokens, word_index, size=20, pattern=package_code_pattern):
  607. '''
  608. @summary:抽取包附近的标段号
  609. @param:
  610. tokens:包所在句子的分词
  611. word_index:包所在字偏移
  612. size:左右各取多少个词
  613. pattern:提取标段号的正则
  614. @return: type:string,meaning:标段号
  615. '''
  616. index = changeIndexFromWordToWords(tokens, word_index)
  617. if index < size:
  618. begin = index
  619. else:
  620. begin = index - size
  621. if index + size > len(tokens):
  622. end = len(tokens)
  623. else:
  624. end = index + size
  625. # 拿到左右两边的词语组成短语
  626. text = "".join(tokens[begin:end])
  627. # 在短语中的字偏移
  628. new_word_index = word_index - len("".join(tokens[:begin]))
  629. min_distance = len(text)
  630. packageCode = None
  631. for the_iter in re.finditer(pattern, text):
  632. # 算出最小距离
  633. distance = min([abs(new_word_index - the_iter.span()[0]), abs(new_word_index - the_iter.span()[1])])
  634. if distance < min_distance:
  635. min_distance = distance
  636. packageCode = the_iter.group(1)
  637. return packageCode
  638. def get_package():
  639. PackageList_scope = []
  640. True_package = set()
  641. for i in range(len(list_sentence)):
  642. PackageList_item = []
  643. PackageList_item_scope = []
  644. content = list_sentence[i].sentence_text
  645. # content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')')
  646. # # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10
  647. # content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content)
  648. #
  649. # for it in re.finditer('CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
  650. # |标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\
  651. # |\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+', content):
  652. # content = content.replace(it.group(0), ' ' * len(it.group(0)))
  653. # tokens = list_sentence[i].tokens
  654. # _names = []
  655. # for iter in re.finditer(package_number_pattern, content):
  656. # if re.search('(业绩|信誉要求):', content[:iter.start()]): # 前面有业绩或信誉的标段去掉
  657. # continue
  658. # # print('提取到标段:%s, 前后文:%s'%(iter.group(), content[iter.start()-5:iter.end()+5]))
  659. # if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]): # 排除2.10标段3 5.4标段划分 这种情况
  660. # # print('过滤掉错误包:', iter.group())
  661. # continue
  662. # if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[iter.start():iter.end()+3]) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
  663. # # print('过滤掉错误包:', iter.group())
  664. # continue
  665. # elif iter.end()+2 < len(content) and re.search('标准|标的物|标志|包装|划分|标书', content[iter.start():iter.end()+2]):
  666. # # print('过滤掉错误包:',iter.group())
  667. # continue
  668. # elif re.search('同一(标段?|包)', content[max(0, iter.start()-2):iter.end()]): # 不得参加同一标段
  669. # # print('过滤掉错误包:', iter.group())
  670. # continue
  671. # elif re.search('三包', content[max(0, iter.start()-2):iter.end()]) and re.search('第三包', content[max(0, iter.start()-2):iter.end()])==None: # 规规章和“三包”规定
  672. # # print('过滤掉错误包:', iter.group())
  673. # continue
  674. # elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)):
  675. # # print('过滤掉错误包号5:', iter.group(0))
  676. # continue
  677. tokens = list_sentence[i].tokens
  678. _names = []
  679. for iter in find_package(content):
  680. temp_package_number = uniform_package_name(iter.group(0))
  681. True_package.add(temp_package_number)
  682. PackageList_item.append({"name": temp_package_number, "sentence_index": list_sentence[i].sentence_index,
  683. "offsetWords_begin": changeIndexFromWordToWords(tokens, iter.span()[0]),
  684. "offsetWord_begin": iter.span()[0], "offsetWord_end": iter.span()[1]})
  685. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  686. code = extractPackageCode(tokens, iter.span()[0])
  687. if code is not None:
  688. dict_packageCode[temp_package_number] = code
  689. PackageSet.add(temp_package_number)
  690. if not list_sentence[i].in_attachment: # 保存不在附件的包号
  691. main_body_pack.add(temp_package_number)
  692. # 识别packageScope
  693. for iter in re.finditer(pattern_packageScope, content):
  694. PackageList_item_scope.append({"name": "", "sentence_index": list_sentence[i].sentence_index,
  695. "offsetWords_begin": changeIndexFromWordToWords(tokens, iter.span()[0]),
  696. "offsetWord_begin": iter.span()[0], "offsetWord_end": iter.span()[1]})
  697. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  698. PackageList_item_scope = PackageList_item + PackageList_item_scope
  699. PackageList_item_scope.sort(key=lambda x: x["offsetWord_begin"])
  700. PackageList_scope = PackageList_scope + PackageList_item_scope
  701. PackageList_item.sort(key=lambda x: x["sentence_index"])
  702. return PackageList_scope, True_package
  703. def get_win_project():
  704. '''获取多个项目多个中标人的项目'''
  705. PackageList_scope = []
  706. True_package = set()
  707. # 2020/11/23 大网站规则 调整
  708. if len(PackageSet) == 0 and len(
  709. set([it.entity_text for it in list_entity if
  710. it.entity_type in ['org', 'company'] and it.label == 2])) > 1:
  711. for i in range(len(list_sentence)):
  712. PackageList_item = []
  713. PackageList_item_scope = []
  714. content = list_sentence[i].sentence_text
  715. tokens = list_sentence[i].tokens
  716. names = re.findall(other_package_pattern, content)
  717. N_names = re.findall(win_tenderer_pattern, content)
  718. if len(names) != 1 or len(N_names) != 1:
  719. continue
  720. for iter in re.finditer(other_package_pattern, content):
  721. temp_package_number = iter.group(4)
  722. xinghao = re.search(model_pattern, content)
  723. if xinghao:
  724. temp_package_number = temp_package_number + '+' + xinghao.group(2)
  725. # print('新正则采购包名补充',temp_package_number)
  726. if re.search(re_digital, temp_package_number):
  727. temp_package_number = str(int(temp_package_number))
  728. True_package.add(temp_package_number)
  729. PackageList_item.append(
  730. {"name": temp_package_number, "sentence_index": list_sentence[i].sentence_index,
  731. "offsetWords_begin": changeIndexFromWordToWords(tokens, iter.span()[0]),
  732. "offsetWord_begin": iter.span()[0], "offsetWord_end": iter.span()[1]})
  733. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  734. code = extractPackageCode(tokens, iter.span()[0])
  735. if code is not None:
  736. dict_packageCode[temp_package_number] = code
  737. PackageSet.add(temp_package_number)
  738. if not list_sentence[i].in_attachment: # 保存不在附件的包号
  739. main_body_pack.add(temp_package_number)
  740. # 识别packageScope
  741. for iter in re.finditer(pattern_packageScope, content):
  742. PackageList_item_scope.append({"name": "", "sentence_index": list_sentence[i].sentence_index,
  743. "offsetWords_begin": changeIndexFromWordToWords(tokens,
  744. iter.span()[0]),
  745. "offsetWord_begin": iter.span()[0],
  746. "offsetWord_end": iter.span()[1]})
  747. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  748. PackageList_item_scope = PackageList_item + PackageList_item_scope
  749. PackageList_item_scope.sort(key=lambda x: x["offsetWord_begin"])
  750. PackageList_scope = PackageList_scope + PackageList_item_scope
  751. PackageList_item.sort(key=lambda x: x["sentence_index"])
  752. return PackageList_scope, True_package
  753. def get_package_scope(PackageList_scope):
  754. PackageList = []
  755. pattern_punctuation = "[::()\(\),,。;;]"
  756. # print("===packageList_scope",PackageList_scope)
  757. for i in range(len(list_sentence)):
  758. for j in range(len(PackageList_scope)):
  759. if i == PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"] != "":
  760. _flag = False
  761. left_str = list_sentence[i].sentence_text[
  762. PackageList_scope[j]["offsetWord_begin"] - 30:PackageList_scope[j][
  763. "offsetWord_begin"] + 1]
  764. right_str = list_sentence[i].sentence_text[
  765. PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"] + 30]
  766. _left_find = re.findall(pattern_punctuation, left_str)
  767. _right_find = re.findall(pattern_punctuation, right_str)
  768. # print(left_str)
  769. if re.search("同", left_str[-1:]) is not None and PackageList_scope[j]["name"] == "一":
  770. continue
  771. if re.search("划分", right_str[:10]) is not None:
  772. continue
  773. if len(_left_find) > 0 and _left_find[-1] in [":", ":"]:
  774. _flag = True
  775. if len(_right_find) > 0 and _right_find[0] in [":", ":"]:
  776. _flag = True
  777. if _flag:
  778. scope_begin = [PackageList_scope[j]["sentence_index"],
  779. PackageList_scope[j]["offsetWords_begin"]]
  780. else:
  781. scope_begin = [PackageList_scope[j]["sentence_index"], 0] # 2024/10/10 改为包作用域开始位置为包号所在句子开头
  782. # if j == 0:
  783. # scope_begin = [0, 0]
  784. # else:
  785. # scope_begin = [PackageList_scope[j - 1]["sentence_index"],
  786. # PackageList_scope[j - 1]["offsetWords_begin"]]
  787. if j == len(PackageList_scope) - 1:
  788. scope_end = [list_sentence[-1].sentence_index,
  789. changeIndexFromWordToWords(list_sentence[-1].tokens,
  790. len(list_sentence[
  791. -1].sentence_text))]
  792. else:
  793. scope_end = [PackageList_scope[j + 1]["sentence_index"],
  794. PackageList_scope[j + 1]["offsetWords_begin"]]
  795. if j>0 and PackageList_scope[j - 1]["sentence_index"] == PackageList_scope[j]["sentence_index"] and \
  796. PackageList_scope[j - 1]["offsetWord_begin"] <= PackageList_scope[j]["offsetWord_begin"] and \
  797. PackageList_scope[j - 1]["offsetWord_end"] >= PackageList_scope[j]["offsetWord_end"]:
  798. continue
  799. # add package to entity
  800. _pack_entity = Entity(doc_id=list_sentence[0].doc_id, entity_id="%s_%s_%s_%s" % (
  801. list_sentence[0].doc_id, i, PackageList_scope[j]["offsetWord_begin"],
  802. PackageList_scope[j]["offsetWord_begin"]), entity_text=PackageList_scope[j]["name"],
  803. entity_type="package", sentence_index=PackageList_scope[j]["sentence_index"],
  804. begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,
  805. PackageList_scope[j][
  806. "offsetWord_begin"]),
  807. end_index=changeIndexFromWordToWords(list_sentence[i].tokens,
  808. PackageList_scope[j]["offsetWord_end"]),
  809. wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],
  810. wordOffset_end=PackageList_scope[j]["offsetWord_end"],
  811. in_attachment=list_sentence[i].in_attachment)
  812. list_entity.append(_pack_entity)
  813. copy_pack = copy.copy(PackageList_scope[j])
  814. copy_pack["scope"] = [scope_begin, scope_end]
  815. copy_pack["hit"] = set()
  816. copy_pack["pointer"] = _pack_entity
  817. PackageList.append(copy_pack)
  818. return PackageList
  819. PackageList_scope, True_package = get_package()
  820. # PackageList_scope2, True_package2 = get_win_project() # 20240508 与表格提取重复,去掉
  821. # if len(True_package2) > 2: # 同时包含多标段及多中标人的
  822. # PackageList_scope = PackageList_scope + PackageList_scope2
  823. PackageList = get_package_scope(PackageList_scope)
  824. # if len(PackageSet)<2: # 20230922只提取到一个包号的去掉,都放在默认包project 2024/02/02 注释掉,防止多标段每篇公告只公布一个标段的没法提取标段号
  825. # return [], set(), {}
  826. return PackageList, PackageSet, dict_packageCode, main_body_pack
  827. # km配对方法
  828. def dispatch(match_list):
  829. main_roles = list(set([match.main_role for match in match_list]))
  830. # print('main_roles',[i.entity_text for i in main_roles])
  831. attributes = list(set([match.attribute for match in match_list]))
  832. # try:
  833. # print('attributes',[i.entity_text for i in attributes])
  834. # except:
  835. # pass
  836. label = np.zeros(shape=(len(main_roles), len(attributes)))
  837. for match in match_list:
  838. main_role = match.main_role
  839. attribute = match.attribute
  840. value = match.value
  841. label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000
  842. # print(label)
  843. gragh = -label
  844. # km算法
  845. row, col = linear_sum_assignment(gragh)
  846. max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value]
  847. # return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch]
  848. return [(main_roles[row], attributes[col]) for row, col in max_dispatch]
  849. from BiddingKG.dl.common.Utils import getUnifyMoney
  850. from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
# Module-level Model_relation_extraction instance, created once at import time.
relationExtraction_model = Model_relation_extraction()
  852. def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,winter_scope,on_value = 0.5,on_value_person=0.5,sentence_len=4):
  853. '''
  854. @param:
  855. PackDict:文章包dict
  856. roleSet:文章所有角色的公司名称
  857. PackageList:文章的包信息
  858. PackageSet:文章所有包的名称
  859. list_entity:文章所有经过模型处理的实体
  860. on_value:金额模型的阈值
  861. on_value_person:联系人模型的阈值
  862. sentence_len:公司和属性间隔句子的最大长度
  863. @return:添加了属性信息的角色list
  864. '''
  865. #根据roleid添加金额到rolelist中
  866. def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
  867. for i in range(len(packDict[packageName]["roleList"])):
  868. if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
  869. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  870. packDict[packageName]["roleList"][i].money = money
  871. packDict[packageName]["roleList"][i].money_prob = money_prob
  872. return packDict
  873. #根据实体名称添加金额到rolelist中
  874. def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
  875. for i in range(len(packDict[packageName]["roleList"])):
  876. if packDict[packageName]["roleList"][i].entity_text==entity:
  877. # if money_prob>packDict[packageName]["roleList"][i].money_prob:
  878. # packDict[packageName]["roleList"][i].money = money
  879. # packDict[packageName]["roleList"][i].money_prob = money_prob
  880. if packDict[packageName]["roleList"][i].money_prob==0 : # 2021/7/20第一次更新金额
  881. if money.notes == '单价':
  882. packDict[packageName]["roleList"][i].unit_price = money.entity_text
  883. else:
  884. packDict[packageName]["roleList"][i].money = money.entity_text
  885. packDict[packageName]["roleList"][i].money_prob = money_prob
  886. packDict[packageName]["roleList"][i].money_unit = money.money_unit
  887. elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or (money.notes in ['大写'] and money.in_attachment==False): # 2021/7/20改为优先选择大写金额,
  888. # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob)
  889. # print('链接金额备注 ',money.notes, money.entity_text, money.values)
  890. if money.notes == '单价':
  891. packDict[packageName]["roleList"][i].unit_price = money.entity_text
  892. else:
  893. packDict[packageName]["roleList"][i].money = money.entity_text
  894. packDict[packageName]["roleList"][i].money_prob = money_prob
  895. packDict[packageName]["roleList"][i].money_unit = money.money_unit
  896. # print('链接中的金额:{0}, 单位:{1}'.format(money.entity_text, money.money_unit))
  897. return packDict
  898. def addRatioByEntity(packDict,packageName,entity,ratio):
  899. for i in range(len(packDict[packageName]["roleList"])):
  900. if packDict[packageName]["roleList"][i].entity_text==entity:
  901. packDict[packageName]["roleList"][i].ratio = ratio.ratio_value
  902. def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
  903. for i in range(len(packDict[packageName]["roleList"])):
  904. if packDict[packageName]["roleList"][i].entity_text==entity and not packDict[packageName]["roleList"][i].serviceTime:
  905. # packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text
  906. packDict[packageName]["roleList"][i].serviceTime = extract_serviceTime(serviceTime.entity_text,"")
  907. #根据实体名称得到角色
  908. def getRoleWithText(packDict,entity_text):
  909. for pack in packDict.keys():
  910. for i in range(len(packDict[pack]["roleList"])):
  911. if packDict[pack]["roleList"][i].entity_text==entity_text:
  912. return packDict[pack]["roleList"][i].role_name
  913. def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
  914. _list_entitys = [entity]+entity.linked_entitys
  915. for _entity in _list_entitys:
  916. if _entity.entity_text in RoleSet:
  917. return True
  918. p_entity = 0
  919. # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000
  920. # money_list = [it for it in list_entity if it.entity_type=="money"]
  921. # for i in range(len(money_list)-1):
  922. # for j in range(1, len(money_list)):
  923. # if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
  924. # Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
  925. # money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
  926. # # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
  927. '''同样金额同时有元及万元单位的,把万元的金额改为元'''
  928. wanyuan = []
  929. yuan = []
  930. for it in list_entity:
  931. if it.entity_type == "money" and float(it.entity_text)>1000000: # 20240523 修改为百万以上金额才对比万倍关系,其他又行业限额纠正避免有些万元单位提取不到从而被除一万 例:52435607 最高限价(万元):22679.32 蜀冈招标控制价22679.32工程地点南路西侧(万元)
  932. if it.money_unit == '万元' or float(it.entity_text)>5000000000:
  933. wanyuan.append(it)
  934. if it.money_unit == '元' or float(it.entity_text)<5000000:
  935. yuan.append(it)
  936. if wanyuan != [] and yuan != []:
  937. for m1 in wanyuan:
  938. for m2 in yuan:
  939. if Decimal(m1.entity_text)/Decimal(m2.entity_text) == 10000:
  940. m1.entity_text = m2.entity_text
  941. #遍历所有实体
  942. # while(p_entity<len(list_entity)):
  943. # entity = list_entity[p_entity]
  944. '''
  945. #招标金额从后往前找
  946. if entity.entity_type=="money":
  947. if entity.values[entity.label]>=on_value:
  948. if str(entity.label)=="0":
  949. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  950. if packagePointer is None:
  951. packageName = "Project"
  952. else:
  953. packageName = packagePointer.entity_text
  954. addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
  955. '''
  956. ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
  957. if entity.entity_type=="person":
  958. if entity.values[entity.label]>=on_value_person:
  959. if str(entity.label)=="1":
  960. for i in range(len(PackDict["Project"]["roleList"])):
  961. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  962. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  963. # add pointer_person
  964. for _entity in list_entity:
  965. if dict_role_id.get(str(_entity.label))=="tenderee":
  966. for i in range(len(PackDict["Project"]["roleList"])):
  967. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  968. _entity.pointer_person = entity
  969. elif str(entity.label)=="2":
  970. for i in range(len(PackDict["Project"]["roleList"])):
  971. if PackDict["Project"]["roleList"][i].role_name=="agency":
  972. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  973. # add pointer_person
  974. for _entity in list_entity:
  975. if dict_role_id.get(str(_entity.label))=="agency":
  976. for i in range(len(PackDict["Project"]["roleList"])):
  977. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  978. _entity.pointer_person = entity
  979. '''
  980. # #金额往前找实体
  981. # if entity.entity_type=="money":
  982. # if entity.values[entity.label]>=on_value:
  983. # p_entity_money= p_entity
  984. # entity_money = list_entity[p_entity_money]
  985. # if len(PackageSet)>0:
  986. # packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
  987. # if packagePointer is None:
  988. # packageName_entity = "Project"
  989. # else:
  990. # packageName_entity = packagePointer.entity_text
  991. # else:
  992. # packageName_entity = "Project"
  993. # while(p_entity_money>0):
  994. # entity_before = list_entity[p_entity_money]
  995. # if entity_before.entity_type in ['org','company']:
  996. # if str(entity_before.label)=="1":
  997. # addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
  998. # #add pointer_money
  999. # entity_before.pointer_money = entity_money
  1000. # break
  1001. # p_entity_money -= 1
  1002. #如果实体属于角色集合,则往后找属性
  1003. # if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
  1004. #
  1005. # p_entity += 1
  1006. # #循环查找符合的属性
  1007. # while(p_entity<len(list_entity)):
  1008. #
  1009. # entity_after = list_entity[p_entity]
  1010. # if entity_after.sentence_index-entity.sentence_index>=sentence_len:
  1011. # p_entity -= 1
  1012. # break
  1013. # #若是遇到公司实体,则跳出循环
  1014. # if entity_after.entity_type in ['org','company']:
  1015. # p_entity -= 1
  1016. # break
  1017. # if entity_after.values is not None:
  1018. # if entity_after.entity_type=="money":
  1019. # if entity_after.values[entity_after.label]>=on_value:
  1020. # '''
  1021. # #招标金额从后往前找
  1022. # if str(entity_after.label)=="0":
  1023. # packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  1024. # if packagePointer is None:
  1025. # packageName = "Project"
  1026. # else:
  1027. # packageName = packagePointer.entity_text
  1028. # addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
  1029. # '''
  1030. # if str(entity_after.label)=="1":
  1031. # #print(entity_after.entity_text,entity.entity_text)
  1032. # _list_entitys = [entity]+entity.linked_entitys
  1033. # if len(PackageSet)>0:
  1034. # packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
  1035. # if packagePointer is None:
  1036. # packageName_entity = "Project"
  1037. # else:
  1038. # packageName_entity = packagePointer.entity_text
  1039. # else:
  1040. # packageName_entity = "Project"
  1041. # if str(entity.label) in ["2","3","4"]:
  1042. # # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
  1043. # if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
  1044. # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
  1045. # 0.5)
  1046. # entity.pointer_money = entity_after
  1047. # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
  1048. # else:
  1049. # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
  1050. # entity_after.values[entity_after.label])
  1051. # entity.pointer_money = entity_after
  1052. # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
  1053. # if entity_after.values[entity_after.label]>0.6:
  1054. # break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
  1055. # #add pointer_money
  1056. # # entity.pointer_money = entity_after
  1057. # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
  1058. # # if entity_after.notes!='单价':
  1059. # # break # 2021/7/16 新增,找到中标金额即停止,不再往后找金额
  1060. # '''
  1061. # if entity_after.entity_type=="person":
  1062. # if entity_after.values[entity_after.label]>=on_value_person:
  1063. # if str(entity_after.label)=="1":
  1064. # for i in range(len(roleList)):
  1065. # if roleList[i].role_name=="tenderee":
  1066. # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  1067. # elif str(entity_after.label)=="2":
  1068. # for i in range(len(roleList)):
  1069. # if roleList[i].role_name=="agency":
  1070. # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  1071. # elif str(entity_after.label)=="3":
  1072. # _list_entitys = [entity]+entity.linked_entitys
  1073. # for _entity in _list_entitys:
  1074. # for i in range(len(roleList)):
  1075. # if roleList[i].entity_text==_entity.entity_text:
  1076. # if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
  1077. # break
  1078. # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  1079. # '''
  1080. #
  1081. # p_entity += 1
  1082. #
  1083. # p_entity += 1
  1084. # 记录每句的分词数量
  1085. tokens_num_dict = dict()
  1086. last_tokens_num = 0
  1087. for sentence in list_sentence:
  1088. _index = sentence.sentence_index
  1089. if _index == 0:
  1090. tokens_num_dict[_index] = 0
  1091. else:
  1092. tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
  1093. last_tokens_num = len(sentence.tokens)
  1094. attribute_type = ['money','serviceTime','ratio']# 'money'仅指“中投标金额”
  1095. for link_attribute in attribute_type:
  1096. temp_entity_list = []
  1097. if link_attribute=="money":
  1098. temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
  1099. (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)]
  1100. # 删除重复的‘中投标金额’,一般为大小写两种样式
  1101. drop_tendererMoney = []
  1102. for ent_idx in range(len(temp_entity_list)-1):
  1103. entity = temp_entity_list[ent_idx]
  1104. if entity.entity_type=='money':
  1105. next_entity = temp_entity_list[ent_idx+1]
  1106. if next_entity.entity_type=='money':
  1107. if getUnifyMoney(entity.entity_text)==getUnifyMoney(next_entity.entity_text):
  1108. if (tokens_num_dict[next_entity.sentence_index] + next_entity.begin_index) - (
  1109. tokens_num_dict[entity.sentence_index] + entity.end_index) < 10:
  1110. drop_tendererMoney.append(next_entity)
  1111. for _drop in drop_tendererMoney:
  1112. temp_entity_list.remove(_drop)
  1113. elif link_attribute=="serviceTime":
  1114. temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
  1115. ent.entity_type=='serviceTime']
  1116. elif link_attribute=="ratio":
  1117. temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
  1118. ent.entity_type=='ratio']
  1119. temp_entity_list = sorted(temp_entity_list,key=lambda x: (x.sentence_index, x.begin_index))
  1120. temp_match_list = []
  1121. for ent_idx in range(len(temp_entity_list)):
  1122. entity = temp_entity_list[ent_idx]
  1123. if entity.entity_type in ['org','company']:
  1124. match_nums = 0
  1125. tenderer_nums = 0 #经过其他中投标人的数量
  1126. byNotTenderer_match_nums = 0 #跟在中投标人后面的属性
  1127. for after_index in range(ent_idx + 1, min(len(temp_entity_list), ent_idx + 4)):
  1128. after_entity = temp_entity_list[after_index]
  1129. if entity.in_attachment != after_entity.in_attachment: # 正文与附件的不能相连
  1130. break
  1131. if after_entity.entity_type == link_attribute:
  1132. distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
  1133. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1134. sentence_distance = after_entity.sentence_index - entity.sentence_index
  1135. value = (-1 / 2 * (distance ** 2)) / 10000
  1136. if link_attribute == "money":
  1137. if after_entity.notes == '单价':
  1138. value = value * 100
  1139. if sentence_distance == 0:
  1140. if distance < 100:
  1141. # value = (-1 / 2 * (distance ** 2)) / 10000
  1142. temp_match_list.append(Match(entity, after_entity, value))
  1143. match_nums += 1
  1144. if not tenderer_nums:
  1145. byNotTenderer_match_nums += 1
  1146. else:
  1147. break
  1148. else:
  1149. if distance < 60:
  1150. # value = (-1 / 2 * (distance ** 2)) / 10000
  1151. temp_match_list.append(Match(entity, after_entity, value))
  1152. match_nums += 1
  1153. if not tenderer_nums:
  1154. byNotTenderer_match_nums += 1
  1155. else:
  1156. break
  1157. else:
  1158. tenderer_nums += 1
  1159. #前向查找属性
  1160. if ent_idx!=0 and (not match_nums or not byNotTenderer_match_nums):
  1161. previous_entity = temp_entity_list[ent_idx - 1]
  1162. if previous_entity.entity_type == link_attribute:
  1163. # if previous_entity.sentence_index == entity.sentence_index:
  1164. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  1165. tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
  1166. if distance < 40:
  1167. # 前向 没有 /10000
  1168. value = (-1 / 2 * (distance ** 2))
  1169. temp_match_list.append(Match(entity, previous_entity, value))
  1170. # km算法分配求解
  1171. dispatch_result = dispatch(temp_match_list)
  1172. dispatch_result = sorted(dispatch_result, key=lambda x: (x[0].sentence_index,x[0].begin_index))
  1173. for match in dispatch_result:
  1174. _entity = match[0]
  1175. _attribute = match[1]
  1176. if link_attribute=='money':
  1177. _entity.pointer_money = _attribute
  1178. packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
  1179. "money-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
  1180. # print(_entity.entity_text,_attribute.entity_text)
  1181. if packagePointer is None:
  1182. packageName_entity = "Project"
  1183. else:
  1184. packageName_entity = packagePointer.entity_text
  1185. if _attribute.notes == '单价' or float(_attribute.entity_text) < 5000: # 2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
  1186. # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label])
  1187. addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,0.5)
  1188. else:
  1189. # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label])
  1190. addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,
  1191. _attribute.values[_attribute.label])
  1192. elif link_attribute=='serviceTime':
  1193. _entity.pointer_serviceTime = _attribute
  1194. packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
  1195. "serviceTime-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
  1196. if packagePointer is None:
  1197. packageName_entity = "Project"
  1198. else:
  1199. packageName_entity = packagePointer.entity_text
  1200. addServiceTimeByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
  1201. elif link_attribute=='ratio':
  1202. _entity.pointer_ratio = _attribute
  1203. packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
  1204. "ratio-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
  1205. if packagePointer is None:
  1206. packageName_entity = "Project"
  1207. else:
  1208. packageName_entity = packagePointer.entity_text
  1209. addRatioByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
  1210. ''''''
  1211. # 通过模型分类的招标/代理联系人
  1212. list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
  1213. person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]]
  1214. tenderee_contact = set()
  1215. tenderee_phone = set()
  1216. agency_contact = set()
  1217. agency_phone = set()
  1218. winter_contact = set()
  1219. rule_winter_phone = set()
  1220. for _person in person_list:
  1221. if _person.label == 1:
  1222. tenderee_contact.add(_person.entity_text)
  1223. if _person.label == 2:
  1224. agency_contact.add(_person.entity_text)
  1225. # 正则匹配无 '主体/联系人' 的电话
  1226. # 例:"采购人联系方式:0833-5226788,"
  1227. phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
  1228. '\+86.?1[3-9]\d{9}|' \
  1229. '0[1-9]\d{1,2}[-—-―][2-9]\d{6,7}/[1-9]\d{6,10}|' \
  1230. '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?.?转\d{1,4}|' \
  1231. '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|' \
  1232. '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=1[3-9]\d{9})|' \
  1233. '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|' \
  1234. '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=[2-9]\d{6,7})|' \
  1235. '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?|' \
  1236. '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|' \
  1237. '[2-9]\d{6,7})'
  1238. re_tenderee_phone = re.compile(
  1239. # "(?:(?:(?:采购|招标|议价|议标|比选|业主|委托)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)(?:单位)?[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
  1240. "(?:(?:(?:遴选|寻源|采购|招标|竞价|议价|比选|(?:[^受被]|^)委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求?|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)"
  1241. "(?:人|方|商|单位|组织|用户|业主|主体|部门|公司|企业))(?:单位)?[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
  1242. # 电话号码
  1243. + phone_pattern)
  1244. # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
  1245. re_tenderee_phone2 = re.compile(
  1246. # "(?:(?:(?:采购|招标|议价|议标|比选|业主)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)(?:单位)?[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
  1247. "(?:(?:(?:遴选|寻源|采购|招标|竞价|议价|比选|(?:[^受被]|^)委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求?|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)"
  1248. "(?:人|方|商|单位|组织|用户|业主|主体|部门|公司|企业))(?:单位)?[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话|联系人和联系方式)[::]?[^。]{0,20}?)"
  1249. # 电话号码
  1250. + phone_pattern)
  1251. re_agent_phone = re.compile(
  1252. "(?:(?:(?:代理|[受被]委托)(?:人|方|商|机构|公司|单位|组织|企业)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话|联系人和联系方式)[::]?[^。]{0,7}?)"
  1253. # 电话号码
  1254. + phone_pattern)
  1255. re_agent_phone2 = re.compile(
  1256. "(?:(?:(?:代理|[受被]委托)(?:人|方|商|机构|公司|单位|组织|企业)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话|联系人和联系方式)[::]?[^。]{0,20}?)"
  1257. # 电话号码
  1258. + phone_pattern)
  1259. re_win_tenderer_phone = re.compile(
  1260. "(?:(?:(?:乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选)"
  1261. "(?:候选|投标)?(?:人|单位|(?:中介)?(?:服务)?机构|供应商|客户|方|公司|企业|厂商|商|社会资本方?)|选定单位|中[标选]银行|成交对象)[^。审核]{0,5}(?:负责人|联系人|项目)?(?:经理|电话|联系方式|联系人|负责人|联系电话|联系人和联系方式)[::]?[^。]{0,7}?)"
  1262. + phone_pattern)
  1263. re_win_tenderer_phone2 = re.compile(
  1264. "(?:(?:(?:乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选)"
  1265. "(?:候选|投标)?(?:人|单位|(?:中介)?(?:服务)?机构|供应商|客户|方|公司|企业|厂商|商|社会资本方?)|选定单位|中[标选]银行|成交对象)[^。]{0,3}(?:地址)[^。审核]{0,3}(?:负责人|联系人|项目)?(?:经理|电话|联系方式|联系人|负责人|联系电话|联系人和联系方式)[::]?[^。]{0,20}?)"
  1266. + phone_pattern)
  1267. not_win_tenderer_contact = re.compile("纪检|监察|质疑|投诉|监督|受理|请.{0,4}(联系|与)"
  1268. "|(遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求?|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选|发布|代理|拍卖|转出){1,2}"
  1269. "(人|方|商|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行|机构){0,2}"
  1270. "[\u4e00-\u9fa5]{0,4}(联系|咨询|电话)(人|电话|方式)?")
  1271. content = ""
  1272. for _sentence in list_sentence:
  1273. content += "".join(_sentence.tokens)
  1274. _content = copy.deepcopy(content)
  1275. while re.search("(.)(,)([^0-9])|([^0-9])(,)(.)", content):
  1276. content_words = list(content)
  1277. for i in re.finditer("(.)(,)([^0-9])", content):
  1278. content_words[i.span(2)[0]] = ""
  1279. for i in re.finditer("([^0-9])(,)(.)", content):
  1280. content_words[i.span(2)[0]] = ""
  1281. content = "".join(content_words)
  1282. content = re.sub("[::]|[\((]|[\))]", "", content)
  1283. _tenderee_phone = re.findall(re_tenderee_phone, content)
  1284. # 更新正则确定的角色属性
  1285. for i in range(len(PackDict["Project"]["roleList"])):
  1286. if PackDict["Project"]["roleList"][i].role_name == "tenderee":
  1287. _tenderee_phone = re.findall(re_tenderee_phone, content)
  1288. if _tenderee_phone:
  1289. for _phone in _tenderee_phone:
  1290. _phone = _phone.split("/") # 分割多个号码
  1291. for one_phone in _phone:
  1292. PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
  1293. tenderee_phone.add(one_phone)
  1294. _tenderee_phone2 = re.findall(re_tenderee_phone2, content)
  1295. if _tenderee_phone2:
  1296. for _phone in _tenderee_phone2:
  1297. _phone = _phone.split("/")
  1298. for one_phone in _phone:
  1299. PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
  1300. tenderee_phone.add(one_phone)
  1301. if PackDict["Project"]["roleList"][i].role_name == "agency":
  1302. _agent_phone = re.findall(re_agent_phone, content)
  1303. if _agent_phone:
  1304. for _phone in _agent_phone:
  1305. _phone = _phone.split("/")
  1306. for one_phone in _phone:
  1307. PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
  1308. agency_phone.add(one_phone)
  1309. _agent_phone2 = re.findall(re_agent_phone2, content)
  1310. if _agent_phone2:
  1311. for _phone in _agent_phone2:
  1312. _phone = _phone.split("/")
  1313. for one_phone in _phone:
  1314. PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
  1315. agency_phone.add(one_phone)
  1316. # 中标人联系方式规则筛选
  1317. _winter_phone = re.findall(re_win_tenderer_phone, content)
  1318. if _winter_phone:
  1319. for _phone in _winter_phone:
  1320. _phone = _phone.split("/")
  1321. for one_phone in _phone:
  1322. rule_winter_phone.add(one_phone)
  1323. _winter_phone2 = re.findall(re_win_tenderer_phone2, content)
  1324. if _winter_phone2:
  1325. for _phone in _winter_phone2:
  1326. _phone = _phone.split("/")
  1327. for one_phone in _phone:
  1328. rule_winter_phone.add(one_phone)
  1329. # 正则提取电话号码实体
  1330. # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
  1331. phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
  1332. '\+86.?1[3-9]\d{9}|'
  1333. # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
  1334. '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
  1335. '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
  1336. '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
  1337. '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
  1338. '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
  1339. '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
  1340. '400\d{7}转\d{1,4}|'
  1341. '[2-9]\d{6,7}')
  1342. url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[#$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
  1343. email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
  1344. "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
  1345. phone_entitys = []
  1346. code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
  1347. for _sentence in list_sentence:
  1348. sentence_text = _sentence.sentence_text
  1349. # 过长数字串直接过滤替换
  1350. for _re in re.findall("\d{50,}",sentence_text):
  1351. sentence_text = sentence_text.replace(_re,"#"*len(_re))
  1352. in_attachment = _sentence.in_attachment
  1353. list_tokenbegin = []
  1354. begin = 0
  1355. for i in range(0, len(_sentence.tokens)):
  1356. list_tokenbegin.append(begin)
  1357. begin += len(str(_sentence.tokens[i]))
  1358. list_tokenbegin.append(begin + 1)
  1359. # 排除网址、邮箱、项目编号实体
  1360. error_list = []
  1361. for i in re.finditer(url_pattern, sentence_text):
  1362. error_list.append((i.start(), i.end()))
  1363. for i in re.finditer(email_pattern, sentence_text):
  1364. error_list.append((i.start(), i.end()))
  1365. for code_ent in [ent for ent in code_entitys if ent.sentence_index==_sentence.sentence_index]:
  1366. error_list.append((code_ent.wordOffset_begin,code_ent.wordOffset_end))
  1367. res_set = set()
  1368. for i in re.finditer(phone, sentence_text):
  1369. is_continue = False
  1370. for error_ent in error_list:
  1371. if i.start()>=error_ent[0] and i.end()<=error_ent[1]:
  1372. is_continue = True
  1373. break
  1374. if is_continue:
  1375. continue
  1376. res_set.add((i.group(), i.start(), i.end()))
  1377. res_set = sorted(list(res_set),key=lambda x:x[1])
  1378. # 限制数量,防止异常数据处理时间过长
  1379. res_set = res_set[:200]
  1380. last_phone_mask = True
  1381. error_numStr_index = []
  1382. sentence_phone_list = []
  1383. for item_idx in range(len(res_set)):
  1384. item = res_set[item_idx]
  1385. phone_left = sentence_text[max(0, item[1] - 10):item[1]]
  1386. phone_right = sentence_text[item[2]:item[2] + 10]
  1387. phone_left_num = re.search("[\da-zA-Z\-—-―]+$",phone_left)
  1388. numStr_left = item[1]
  1389. if phone_left_num:
  1390. numStr_left -= len(phone_left_num.group())
  1391. phone_right_num = re.search("^[\da-zA-Z\-—-―]+",phone_right)
  1392. numStr_right = item[2]
  1393. if phone_right_num:
  1394. numStr_right += len(phone_right_num.group())
  1395. numStr_index = (numStr_left,numStr_right)
  1396. if re.search("电话|手机|联系[人方]|联系方式",re.sub(",","",phone_left)):
  1397. pass
  1398. else:
  1399. # 排除“传真号”和其它错误项
  1400. if re.search("传,?真|信,?箱|邮,?[编箱件]|QQ|qq", phone_left):
  1401. if not re.search("电,?话", phone_left):
  1402. error_numStr_index.append(numStr_index)
  1403. last_phone_mask = False
  1404. continue
  1405. if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|费率|标价|证号|资格证|资质|价格|金额|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
  1406. error_numStr_index.append(numStr_index)
  1407. last_phone_mask = False
  1408. continue
  1409. if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
  1410. error_numStr_index.append(numStr_index)
  1411. last_phone_mask = False
  1412. continue
  1413. # 号码含有0过多,不符合规则
  1414. if re.search("0{6,}",item[0]):
  1415. error_numStr_index.append(numStr_index)
  1416. last_phone_mask = False
  1417. continue
  1418. # 前后跟着字母
  1419. if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right):
  1420. error_numStr_index.append(numStr_index)
  1421. last_phone_mask = False
  1422. continue
  1423. # 时间日期类排除
  1424. if re.search("时间|日期", phone_left):
  1425. error_numStr_index.append(numStr_index)
  1426. last_phone_mask = False
  1427. continue
  1428. # 排除号码实体为时间格式 ,例如:20150515
  1429. if re.search("^20(1[0-9]|2[0-5])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
  1430. error_numStr_index.append(numStr_index)
  1431. last_phone_mask = False
  1432. continue
  1433. # 前后跟着长度小于一定值数字的正则排除
  1434. if re.search("\d+[-—-―]?\d*$",phone_left) or re.search("^\d+[-—-―]?\d*",phone_right):
  1435. phone_left_number = re.search("\d+[-—-―]?\d*$",phone_left)
  1436. phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right)
  1437. if phone_left_number:
  1438. if len(phone_left_number.group())<7:
  1439. error_numStr_index.append(numStr_index)
  1440. last_phone_mask = False
  1441. continue
  1442. if phone_right_number:
  1443. if len(phone_right_number.group())<7:
  1444. error_numStr_index.append(numStr_index)
  1445. last_phone_mask = False
  1446. continue
  1447. left_context = re.search("[\da-zA-Z\-—-―]+$",sentence_text[:item[1]])
  1448. if left_context:
  1449. if len(left_context.group()) != len("".join(re.findall(phone, left_context.group()))):
  1450. # if not re.search("(" + phone.pattern + ")$", left_context.group()):
  1451. error_numStr_index.append(numStr_index)
  1452. last_phone_mask = False
  1453. continue
  1454. right_context = re.search("^[\da-zA-Z\-—-―]+", sentence_text[item[2]:])
  1455. if right_context:
  1456. if len(right_context.group()) != len("".join(re.findall(phone, right_context.group()))):
  1457. # if not re.search("^(" + phone.pattern + ")", right_context.group()):
  1458. error_numStr_index.append(numStr_index)
  1459. last_phone_mask = False
  1460. continue
  1461. # if:上一个phone实体不符合条件
  1462. if not last_phone_mask:
  1463. item_start = item[1]
  1464. last_item_end = res_set[item_idx-1][2]
  1465. if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―、]+$",sentence_text[last_item_end:item_start]):
  1466. error_numStr_index.append(numStr_index)
  1467. last_phone_mask = False
  1468. continue
  1469. sentence_phone_list.append(item)
  1470. last_phone_mask = True
  1471. if error_numStr_index:
  1472. drop_list = []
  1473. for item in sentence_phone_list:
  1474. for err_index in error_numStr_index:
  1475. if (item[1]>=err_index[0] and item[1]<=err_index[1]) or (item[2]>=err_index[0] and item[2]<=err_index[1]) or (item[1]<=err_index[0] and item[2]>=err_index[1]):
  1476. drop_list.append(item)
  1477. break
  1478. for _drop_item in drop_list:
  1479. sentence_phone_list.remove(_drop_item)
  1480. for item in sentence_phone_list:
  1481. for j in range(len(list_tokenbegin)):
  1482. if list_tokenbegin[j] == item[1]:
  1483. begin_index = j
  1484. break
  1485. elif list_tokenbegin[j] > item[1]:
  1486. begin_index = j - 1
  1487. break
  1488. for j in range(begin_index, len(list_tokenbegin)):
  1489. if list_tokenbegin[j] >= item[2]:
  1490. end_index = j - 1
  1491. break
  1492. phone_text = re.sub("[-—-―]+","-",item[0]).replace("(","(").replace(")",")")
  1493. _entity = Entity(_sentence.doc_id, None, phone_text, "phone", _sentence.sentence_index, begin_index, end_index, item[1],
  1494. item[2],in_attachment=in_attachment)
  1495. phone_entitys.append(_entity)
  1496. # print('phone_set:',set([ent.entity_text for ent in phone_entitys]))
  1497. def is_company(entity,text):
  1498. # 判断"公司"实体是否为地址地点
  1499. if entity.label!=5 and entity.values[entity.label]>0.5:
  1500. return True
  1501. if ent.is_tail==True:
  1502. return False
  1503. entity_left = text[max(0,entity.wordOffset_begin-10):entity.wordOffset_begin]
  1504. entity_left = re.sub(",()\(\)","",entity_left)
  1505. entity_left = entity_left[-5:]
  1506. if re.search("地址|地点|银行[::]",entity_left):
  1507. return False
  1508. else:
  1509. return True
  1510. pre_entity = []
  1511. for ent in list_entity:
  1512. if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \
  1513. or (ent.entity_type=='location' and len(ent.entity_text)>5):
  1514. pre_entity.append(ent)
  1515. text_data,pre_data = relationExtraction_model.encode(pre_entity + phone_entitys, list_sentence)
  1516. # print(pre_data)
  1517. maxlen = 512
  1518. relation_list = []
  1519. if 0<len(text_data)<=maxlen:
  1520. relation_list = relationExtraction_model.predict(text_data, pre_data)
  1521. else:
  1522. # 公告大于maxlen时,分段预测
  1523. start = 0
  1524. # print("len(pre_data)",len(pre_data))
  1525. temp_data = []
  1526. deal_data = 0
  1527. while start<len(pre_data):
  1528. _pre_data = pre_data[start:start+maxlen]
  1529. _text_data = text_data[start:start+maxlen]
  1530. if relationExtraction_model.check_data(_pre_data):
  1531. temp_data.append((_text_data,_pre_data))
  1532. else:
  1533. if temp_data:
  1534. deal_data += len(temp_data)
  1535. if deal_data>4:
  1536. break
  1537. for _text_data, _pre_data in temp_data:
  1538. relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
  1539. temp_data = []
  1540. start = start + maxlen - 120
  1541. if temp_data:
  1542. deal_data += len(temp_data)
  1543. if deal_data <= 4:
  1544. for _text_data, _pre_data in temp_data:
  1545. relation_list.extend(relationExtraction_model.predict(_text_data, _pre_data))
  1546. # print("预测数据:",len(temp_data))
  1547. # 去重结果
  1548. relation_list = list(set(relation_list))
  1549. # print([(rel[0].entity_text,rel[2].entity_text) for rel in relation_list])
  1550. # relation_list = [] # 放弃原来的模型连接,结果不好控制
  1551. right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
  1552. linked_company = set()
  1553. linked_person = set()
  1554. linked_connetPerson = set()
  1555. linked_phone = set()
  1556. for predicate in ["rel_address","rel_phone","rel_person"]:
  1557. _match_list = []
  1558. _match_combo = []
  1559. for relation in relation_list:
  1560. _subject = relation[0]
  1561. _object = relation[2]
  1562. if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
  1563. if _subject.in_attachment != _object.in_attachment:
  1564. continue
  1565. if relation[1]==predicate:
  1566. distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
  1567. tokens_num_dict[_subject.sentence_index] + _subject.end_index)
  1568. if predicate=="rel_person":
  1569. # print(predicate, _subject.entity_text, _object.entity_text)
  1570. if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
  1571. continue
  1572. # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
  1573. # if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
  1574. if _subject.label in [2,3,4] and re.search(not_win_tenderer_contact,list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-15):_object.wordOffset_begin]):
  1575. # print('not_win_tenderer_contact1')
  1576. continue
  1577. # 角色为招标/代理人,排除"纪检|监察"相关的联系人
  1578. if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
  1579. # if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[_subject.end_index:_object.wordOffset_begin]):
  1580. continue
  1581. if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
  1582. if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系",
  1583. list_sentence[_object.sentence_index-1].sentence_text[-10:]+
  1584. list_sentence[_object.sentence_index].sentence_text[0:_object.wordOffset_begin]):
  1585. continue
  1586. # 角色为中标候选人,排除距离过远的联系人
  1587. if _subject.label in [2, 3, 4] and distance>=40:
  1588. continue
  1589. if distance>0:
  1590. value = (-1 / 2 * (distance ** 2))/10000
  1591. else:
  1592. distance = abs(distance)
  1593. value = (-1 / 2 * (distance ** 2))
  1594. _match_list.append(Match(_subject,_object,value))
  1595. _match_combo.append((_subject,_object))
  1596. match_result = dispatch(_match_list)
  1597. error_list = []
  1598. for mat in list(set(_match_combo)-set(match_result)):
  1599. for temp in match_result:
  1600. if mat[1]==temp[1] and mat[0]!=temp[0]:
  1601. error_list.append(mat)
  1602. break
  1603. result = list(set(_match_combo)-set(error_list))
  1604. if predicate=='rel_person':
  1605. # 从后往前更新状态,已近后向链接的属性不在前向链接(解决错误链接)
  1606. result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
  1607. for combo in result:
  1608. is_continue = False
  1609. if not combo[0].pointer_person:
  1610. combo[0].pointer_person = []
  1611. if combo[1].begin_index<combo[0].begin_index:
  1612. if combo[0].pointer_person:
  1613. for temp in combo[0].pointer_person:
  1614. if temp.begin_index>combo[0].begin_index:
  1615. is_continue = True
  1616. break
  1617. if is_continue:
  1618. continue
  1619. combo[0].pointer_person.append(combo[1])
  1620. linked_company.add(combo[0])
  1621. linked_person.add(combo[1])
  1622. # print(1,combo[0].entity_text,combo[1].entity_text)
  1623. if predicate=='rel_address':
  1624. result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
  1625. for combo in result:
  1626. if combo[0].pointer_address:
  1627. continue
  1628. combo[0].pointer_address = combo[1]
  1629. # print(2,combo[0].entity_text,combo[1].entity_text)
  1630. if predicate=='rel_phone':
  1631. result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
  1632. for combo in result:
  1633. is_continue = False
  1634. if not combo[0].person_phone:
  1635. combo[0].person_phone = []
  1636. if combo[1].begin_index<combo[0].begin_index:
  1637. if combo[0].person_phone:
  1638. for temp in combo[0].person_phone:
  1639. if temp.begin_index>combo[0].begin_index:
  1640. is_continue = True
  1641. break
  1642. if is_continue:
  1643. continue
  1644. combo[0].person_phone.append(combo[1])
  1645. linked_connetPerson.add(combo[0])
  1646. linked_phone.add(combo[1])
  1647. if combo[0].label in [1,2]:
  1648. if PackDict.get("Project"):
  1649. for i in range(len(PackDict["Project"]["roleList"])):
  1650. if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \
  1651. or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'):
  1652. PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
  1653. break
  1654. # print(3,combo[0].entity_text,combo[1].entity_text)
  1655. # "公司——地址" 链接规则补充
  1656. company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
  1657. # company_lacation_EntityList = [ent for ent in pre_entity if (ent.entity_type in ['company', 'org'] and ent.label!=5) or ent.entity_type=="location"]
  1658. company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
  1659. t_match_list = []
  1660. for ent_idx in range(len(company_lacation_EntityList)):
  1661. entity = company_lacation_EntityList[ent_idx]
  1662. if entity.entity_type in ['company', 'org'] and entity.label!=5:
  1663. match_nums = 0
  1664. company_nums = 0 # 经过其他公司的数量
  1665. location_nums = 0 # 经过住址的数量
  1666. for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
  1667. after_entity = company_lacation_EntityList[after_index]
  1668. if after_entity.entity_type == "location":
  1669. distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
  1670. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1671. location_nums += 1
  1672. if distance > 100 or location_nums >= 3:
  1673. break
  1674. sentence_distance = after_entity.sentence_index - entity.sentence_index
  1675. value = (-1 / 2 * (distance ** 2)) / 10000
  1676. if sentence_distance == 0:
  1677. if distance < 60:
  1678. t_match_list.append(Match(entity, after_entity, value))
  1679. match_nums += 1
  1680. if company_nums:
  1681. break
  1682. else:
  1683. if distance < 50:
  1684. t_match_list.append(Match(entity, after_entity, value))
  1685. match_nums += 1
  1686. if company_nums:
  1687. break
  1688. else:
  1689. # type:company/org
  1690. company_nums += 1
  1691. if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
  1692. break
  1693. if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
  1694. break
  1695. if entity.label in [0, 1] and after_entity.label not in [0, 1]:
  1696. break
  1697. # km算法分配求解
  1698. # for item in t_match_list:
  1699. # print("loc_rela",item.main_role.entity_text,item.attribute.entity_text)
  1700. relate_location_result = dispatch(t_match_list)
  1701. relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
  1702. for match in relate_location_result:
  1703. _company = match[0]
  1704. _relation = match[1]
  1705. # print("loc_relation1", _company.entity_text, _relation.entity_text, )
  1706. if not _company.pointer_address:
  1707. # print('loc_relation2',_company.entity_text,_relation.entity_text)
  1708. _company.pointer_address = _relation
  1709. # "联系人——联系电话" 链接规则补充
  1710. # person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
  1711. person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['location']]
  1712. person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
  1713. t_match_list = []
  1714. for ent_idx in range(len(person_phone_EntityList)):
  1715. entity = person_phone_EntityList[ent_idx]
  1716. if entity.entity_type=="person":
  1717. match_nums = 0
  1718. person_nums = 0 # 经过其他中联系人的数量
  1719. byNotPerson_match_nums = 0 # 跟在联系人后面的属性
  1720. phone_nums = 0 # 经过电话的数量
  1721. for after_index in range(ent_idx + 1, min(len(person_phone_EntityList), ent_idx + 8)):
  1722. after_entity = person_phone_EntityList[after_index]
  1723. if after_entity.entity_type == "phone":
  1724. distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
  1725. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1726. phone_nums += 1
  1727. if distance>100 or phone_nums>=4:
  1728. break
  1729. sentence_distance = after_entity.sentence_index - entity.sentence_index
  1730. value = (-1 / 2 * (distance ** 2)) / 10000
  1731. if sentence_distance == 0:
  1732. if distance < 70:
  1733. # value = (-1 / 2 * (distance ** 2)) / 10000
  1734. t_match_list.append(Match(entity, after_entity, value))
  1735. match_nums += 1
  1736. if not person_nums:
  1737. byNotPerson_match_nums += 1
  1738. else:
  1739. break
  1740. else:
  1741. if distance < 30:
  1742. # value = (-1 / 2 * (distance ** 2)) / 10000
  1743. t_match_list.append(Match(entity, after_entity, value))
  1744. match_nums += 1
  1745. if not person_nums:
  1746. byNotPerson_match_nums += 1
  1747. else:
  1748. break
  1749. elif after_entity.entity_type == "person":
  1750. person_nums += 1
  1751. elif after_entity.entity_type in ["company","org"]:
  1752. break
  1753. # 前向查找属性
  1754. if ent_idx != 0 and (not match_nums or not byNotPerson_match_nums):
  1755. previous_entity = person_phone_EntityList[ent_idx - 1]
  1756. if previous_entity.entity_type == 'phone':
  1757. # if previous_entity.sentence_index == entity.sentence_index:
  1758. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  1759. tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
  1760. if distance < 30:
  1761. # 前向 没有 /10000
  1762. value = (-1 / 2 * (distance ** 2))
  1763. t_match_list.append(Match(entity, previous_entity, value))
  1764. # km算法分配求解(person-phone)
  1765. t_match_list = [mat for mat in t_match_list if mat.main_role not in linked_connetPerson and mat.attribute not in linked_phone]
  1766. # print([(mat.main_role.entity_text,mat.attribute.entity_text) for mat in t_match_list])
  1767. personphone_result = dispatch(t_match_list)
  1768. personphone_result = sorted(personphone_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
  1769. for match in personphone_result:
  1770. _person = match[0]
  1771. _phone = match[1]
  1772. if not _person.person_phone:
  1773. _person.person_phone = []
  1774. _person.person_phone.append(_phone)
  1775. # 多个招标人/代理人或者别称
  1776. for idx in range(1,len(pre_entity)):
  1777. _pre_entity = pre_entity[idx]
  1778. if _pre_entity in linked_company and _pre_entity.label==5:
  1779. last_ent = pre_entity[idx-1]
  1780. if last_ent.entity_type in ['company','org'] and last_ent.label in [0,1]:
  1781. if last_ent.sentence_index==_pre_entity.sentence_index:
  1782. mid_text = list_sentence[_pre_entity.sentence_index].sentence_text[last_ent.wordOffset_end:_pre_entity.wordOffset_begin]
  1783. if len(mid_text)<=20 and "," not in mid_text and re.search("[、\((]",mid_text):
  1784. _pre_entity.label = last_ent.label
  1785. _pre_entity.values[last_ent.label] = 0.6
  1786. # 2022/01/25 固定电话可连多个联系人
  1787. temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person']
  1788. temp_person_entitys2 = [] #和固定电话相连的联系人
  1789. for entity in temp_person_entitys:
  1790. if entity.person_phone:
  1791. for _phone in entity.person_phone:
  1792. if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
  1793. temp_person_entitys2.append(entity)
  1794. break
  1795. for index in range(len(temp_person_entitys)):
  1796. entity = temp_person_entitys[index]
  1797. if entity in temp_person_entitys2:
  1798. last_person = entity
  1799. for after_index in range(index + 1, min(len(temp_person_entitys), index + 5)):
  1800. after_entity = temp_person_entitys[after_index]
  1801. if after_entity.sentence_index == last_person.sentence_index and after_entity.begin_index - last_person.end_index < 3:
  1802. for _phone in entity.person_phone:
  1803. if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
  1804. if _phone not in after_entity.person_phone:
  1805. after_entity.person_phone.append(_phone)
  1806. last_person = after_entity
  1807. else:
  1808. break
  1809. if index==0:
  1810. continue
  1811. last_person = entity
  1812. for before_index in range(index-1, max(-1,index-5), -1):
  1813. before_entity = temp_person_entitys[before_index]
  1814. if before_entity.sentence_index == last_person.sentence_index and last_person.begin_index - before_entity.end_index < 3:
  1815. for _phone in entity.person_phone:
  1816. if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
  1817. if _phone not in before_entity.person_phone:
  1818. before_entity.person_phone.append(_phone)
  1819. last_person = before_entity
  1820. else:
  1821. break
  1822. # 更新person为招标/代理联系人的联系方式
  1823. for k in PackDict.keys():
  1824. for i in range(len(PackDict[k]["roleList"])):
  1825. if PackDict[k]["roleList"][i].role_name == "tenderee":
  1826. for _person in person_list:
  1827. if _person.label==1:#招标联系人
  1828. person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
  1829. for _p in person_phone:
  1830. PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
  1831. if not person_phone:
  1832. PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
  1833. if PackDict[k]["roleList"][i].role_name == "agency":
  1834. for _person in person_list:
  1835. if _person.label==2:#代理联系人
  1836. person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
  1837. for _p in person_phone:
  1838. PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
  1839. if not person_phone:
  1840. PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
  1841. # 更新 PackDict
  1842. not_sure_linked = []
  1843. for link_p in list(linked_company):
  1844. for k in PackDict.keys():
  1845. for i in range(len(PackDict[k]["roleList"])):
  1846. if PackDict[k]["roleList"][i].role_name == "tenderee":
  1847. if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 0:
  1848. not_sure_linked.append(link_p)
  1849. continue
  1850. if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
  1851. for per in link_p.pointer_person:
  1852. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1853. if not person_phone:
  1854. if per.entity_text not in agency_contact:
  1855. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1856. continue
  1857. for _p in person_phone:
  1858. if per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
  1859. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1860. elif PackDict[k]["roleList"][i].role_name == "agency":
  1861. if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 1:
  1862. not_sure_linked.append(link_p)
  1863. continue
  1864. if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
  1865. for per in link_p.pointer_person:
  1866. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1867. if not person_phone:
  1868. if per.entity_text not in tenderee_contact:
  1869. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1870. continue
  1871. for _p in person_phone:
  1872. if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone:
  1873. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1874. else:
  1875. if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
  1876. for per in link_p.pointer_person:
  1877. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1878. if not person_phone:
  1879. if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact:
  1880. # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
  1881. if re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[per.sentence_index].sentence_text[max(0, per.wordOffset_begin - 10):per.wordOffset_begin]):
  1882. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1883. winter_contact.add(per.entity_text)
  1884. continue
  1885. for _p in person_phone:
  1886. if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \
  1887. per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
  1888. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1889. winter_contact.add(per.entity_text)
  1890. # 更新org/company实体label为0,1的链接
  1891. for link_p in not_sure_linked:
  1892. for k in PackDict.keys():
  1893. for i in range(len(PackDict[k]["roleList"])):
  1894. if PackDict[k]["roleList"][i].role_name == "tenderee":
  1895. if link_p.label == 0:
  1896. for per in link_p.pointer_person:
  1897. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1898. if not person_phone:
  1899. if per.entity_text not in agency_contact and per.entity_text not in winter_contact:
  1900. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1901. continue
  1902. for _p in person_phone:
  1903. if per.entity_text not in agency_contact and _p.entity_text not in agency_phone and per.entity_text not in winter_contact:
  1904. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1905. elif PackDict[k]["roleList"][i].role_name == "agency":
  1906. if link_p.label == 1:
  1907. for per in link_p.pointer_person:
  1908. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1909. if not person_phone:
  1910. if per.entity_text not in tenderee_contact and per.entity_text not in winter_contact:
  1911. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1912. continue
  1913. for _p in person_phone:
  1914. if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
  1915. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1916. # 使用中标信息大纲提取联系人
  1917. winter_scope_group = []
  1918. if winter_scope:
  1919. winter_scope_begin = winter_scope[0]
  1920. winter_scope_end = winter_scope[1]
  1921. # print(list_sentence[winter_scope_begin[0]].sentence_text[winter_scope_begin[1]:winter_scope_end[1]])
  1922. winter_temporary_list = []
  1923. for entity in list_entity:
  1924. if entity.entity_type in ['org', 'company', 'person']:
  1925. winter_temporary_list.append(entity)
  1926. winter_temporary_list = sorted(winter_temporary_list, key=lambda x: (x.sentence_index, x.begin_index))
  1927. winter_temporary_list2 = []
  1928. for _entity in winter_temporary_list:
  1929. if _entity.sentence_index>=winter_scope_begin[0] and _entity.sentence_index<=winter_scope_end[0]:
  1930. if (_entity.sentence_index==winter_scope_begin[0] and _entity.wordOffset_begin>=winter_scope_begin[1]) or \
  1931. _entity.sentence_index>winter_scope_begin[0]:
  1932. if (_entity.sentence_index == winter_scope_end[0] and _entity.wordOffset_end<=winter_scope_end[1]) or \
  1933. _entity.sentence_index<winter_scope_end[0]:
  1934. winter_temporary_list2.append(_entity)
  1935. # print('winter_scope_entity',[i.entity_text for i in winter_temporary_list2])
  1936. winter_scope_group = winter_temporary_list2
  1937. match_list_winter = []
  1938. for index in range(len(winter_scope_group)):
  1939. entity = winter_scope_group[index]
  1940. if entity.entity_type in ['company','org']:
  1941. match_nums = 0
  1942. for after_index in range(index + 1, min(len(winter_scope_group), index + 4)):
  1943. after_entity = winter_scope_group[after_index]
  1944. if match_nums > 2:
  1945. break
  1946. if after_entity.entity_type == 'person':
  1947. distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
  1948. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1949. # 实体为中标人/候选人,联系人已确定类别【1,2】
  1950. if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
  1951. break
  1952. if entity.label in [2, 3, 4] and distance >= 30:
  1953. break
  1954. # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
  1955. if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 15):after_entity.wordOffset_begin]):
  1956. break
  1957. # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
  1958. if entity.label in [2, 3, 4] and not after_entity.person_phone and not re.search(
  1959. "联系人|联系方式|电话|负责人|经理|法人|法定代表人", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
  1960. continue
  1961. # 角色为招标/代理人,排除"纪检|监察"相关的联系人
  1962. if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
  1963. break
  1964. if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
  1965. if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",list_sentence[after_entity.sentence_index - 1].sentence_text[-10:] +
  1966. list_sentence[after_entity.sentence_index].sentence_text[0:after_entity.wordOffset_begin]):
  1967. continue
  1968. if distance < 80:
  1969. if (entity.label == 0 and after_entity.label == 1) or (
  1970. entity.label == 1 and after_entity.label == 2):
  1971. distance = distance / 100
  1972. value = (-1 / 2 * (distance ** 2)) / 10000
  1973. match_list_winter.append(Match(entity, after_entity, value))
  1974. match_nums += 1
  1975. # 前向查找匹配
  1976. if index != 0:
  1977. previous_entity = winter_scope_group[index - 1]
  1978. if previous_entity.entity_type == 'person' and previous_entity.label in [1,2,3]:
  1979. if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
  1980. continue
  1981. # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
  1982. if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, list_sentence[previous_entity.sentence_index].sentence_text[
  1983. max(0,previous_entity.wordOffset_begin - 15):previous_entity.wordOffset_begin]):
  1984. break
  1985. # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
  1986. if entity.label in [2, 3, 4] and not previous_entity.person_phone and not re.search(
  1987. "联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[previous_entity.sentence_index].sentence_text[
  1988. max(0, previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
  1989. continue
  1990. # 角色为招标/代理人,排除"纪检|监察"相关的联系人
  1991. if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
  1992. max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
  1993. break
  1994. if previous_entity.sentence_index == entity.sentence_index:
  1995. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  1996. tokens_num_dict[
  1997. previous_entity.sentence_index] + previous_entity.end_index)
  1998. if distance < 30:
  1999. # 距离相等时,前向添加处罚值
  2000. # distance += 1
  2001. # 前向 没有 /10000
  2002. value = (-1 / 2 * (distance ** 2))
  2003. match_list_winter.append(Match(entity, previous_entity, value))
  2004. # test
  2005. # match_list_winter = company_contact_link([winter_scope_group])
  2006. # km算法分配求解
  2007. result_winter = dispatch(match_list_winter)
  2008. for match in result_winter:
  2009. _company = match[0]
  2010. _person = match[1]
  2011. _person = _person.entity_text
  2012. # 更新中标人联系方式
  2013. if _company.label==2:
  2014. phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
  2015. for k in PackDict.keys():
  2016. for i in range(len(PackDict[k]["roleList"])):
  2017. if PackDict[k]["roleList"][i].role_name == "win_tenderer":
  2018. if PackDict[k]["roleList"][i].entity_text == _company.entity_text:
  2019. if _person not in tenderee_contact and len(set(phone_) & set(tenderee_phone)) == 0 and \
  2020. _person not in agency_contact and len(set(phone_) & set(agency_phone)) == 0:
  2021. if not phone_:
  2022. PackDict[k]["roleList"][i].linklist.append((_person, ""))
  2023. for p in phone_:
  2024. PackDict[k]["roleList"][i].linklist.append((_person, p))
  2025. if phone_:
  2026. for p in phone_:
  2027. rule_winter_phone.add(p)
  2028. # print('rule_winter_phone',rule_winter_phone)
  2029. re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
  2030. split_list = [0] * 16
  2031. split_dict = {
  2032. "一、": 1,
  2033. "二、": 2,
  2034. "三、": 3,
  2035. "四、": 4,
  2036. "五、": 5,
  2037. "六、": 6,
  2038. "七、": 7,
  2039. "八、": 8,
  2040. "九、": 9,
  2041. "十、": 10,
  2042. "十一、": 11,
  2043. "十二、": 12,
  2044. "十三、": 13,
  2045. "十四、": 14,
  2046. "十五、": 15
  2047. }
  2048. for item in re.finditer(re_split, _content):
  2049. _index = split_dict.get(item.group()[1:])
  2050. if not split_list[_index]:
  2051. split_list[_index] = item.span()[0] + 1
  2052. split_list = [i for i in split_list if i != 0]
  2053. start = 0
  2054. new_split_list = []
  2055. for idx in split_list:
  2056. new_split_list.append((start, idx))
  2057. start = idx
  2058. new_split_list.append((start, len(_content)))
  2059. # 实体列表按照“公告分段”分组
  2060. words_num_dict = dict()
  2061. last_words_num = 0
  2062. for sentence in list_sentence:
  2063. _index = sentence.sentence_index
  2064. if _index == 0:
  2065. words_num_dict[_index] = 0
  2066. else:
  2067. words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
  2068. last_words_num = len(sentence.sentence_text)
  2069. # 公司-联系人连接(km算法)
  2070. re_phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
  2071. '\+86.?1[3-9]\d{9}|'
  2072. # '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
  2073. '0[1-9]\d{1,2}[-—-―][2-9]\d{6,7}[^\d]?转\d{1,4}|'
  2074. '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
  2075. '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
  2076. '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
  2077. '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
  2078. '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
  2079. '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6,7}-?\d{,4}|'
  2080. '400\d{7}转\d{1,4}|'
  2081. '[2-9]\d{6,7}')
  2082. key_phone = re.compile("联系方式|电话|联系人|负责人")
  2083. temporary_list2 = []
  2084. for entity in list_entity:
  2085. # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False:
  2086. if entity.entity_type in ['org', 'company', 'person']:
  2087. temporary_list2.append(entity)
  2088. temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index))
  2089. new_temporary_list2 = []
  2090. for _split in new_split_list:
  2091. temp_list = []
  2092. for _entity in temporary_list2:
  2093. if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
  2094. _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
  2095. temp_list.append(_entity)
  2096. elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
  2097. break
  2098. new_temporary_list2.append(temp_list)
  2099. # print(new_temporary_list2)
  2100. match_list2 = []
  2101. for split_index in range(len(new_temporary_list2)):
  2102. split_entitys = new_temporary_list2[split_index]
  2103. if len(split_entitys)<=1:
  2104. continue
  2105. is_skip = False
  2106. for index in range(len(split_entitys)):
  2107. entity = split_entitys[index]
  2108. if is_skip:
  2109. is_skip = False
  2110. continue
  2111. else:
  2112. if entity.entity_type in ['org', 'company']:
  2113. if entity.label != 5 or entity.entity_text in roleSet:
  2114. match_nums = 0
  2115. for after_index in range(index + 1, min(len(split_entitys), index + 4)):
  2116. after_entity = split_entitys[after_index]
  2117. if entity.in_attachment != after_entity.in_attachment:
  2118. break
  2119. if after_entity.entity_type in ['person']:
  2120. distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
  2121. tokens_num_dict[entity.sentence_index] + entity.end_index)
  2122. # 实体为中标人/候选人,联系人已确定类别【1,2】
  2123. if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
  2124. break
  2125. if entity.label in [2, 3, 4] and distance>=30:
  2126. break
  2127. # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
  2128. # if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位|业主)?联系|(采购|招标)人?联系|请.{0,4}联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
  2129. if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 15):after_entity.wordOffset_begin]):
  2130. # print('not_win_tenderer_contact2')
  2131. break
  2132. # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
  2133. # print('test',after_entity.entity_text,after_entity.person_phone,list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin])
  2134. if entity.label in [2, 3, 4] and not after_entity.person_phone and not re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
  2135. continue
  2136. # 角色为招标/代理人,排除"纪检|监察"相关的联系人
  2137. if entity.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
  2138. break
  2139. if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
  2140. if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",
  2141. list_sentence[after_entity.sentence_index - 1].sentence_text[-10:] +
  2142. list_sentence[after_entity.sentence_index].sentence_text[0:after_entity.wordOffset_begin]):
  2143. continue
  2144. if after_entity.label in [1, 2, 3]:
  2145. # distance = (tokens_num_dict[
  2146. # after_entity.sentence_index] + after_entity.begin_index) - (
  2147. # tokens_num_dict[entity.sentence_index] + entity.end_index)
  2148. sentence_distance = after_entity.sentence_index - entity.sentence_index
  2149. if sentence_distance == 0:
  2150. if distance < 100:
  2151. if entity.label in [2, 3, 4] and distance>40:
  2152. break
  2153. if (entity.label == 0 and after_entity.label == 1) or (
  2154. entity.label == 1 and after_entity.label == 2):
  2155. distance = distance / 100
  2156. value = (-1 / 2 * (distance ** 2)) / 10000
  2157. match_list2.append(Match(entity, after_entity, value))
  2158. match_nums += 1
  2159. else:
  2160. if distance < 60:
  2161. if entity.label in [2, 3, 4] and distance>20:
  2162. break
  2163. if (entity.label == 0 and after_entity.label == 1) or (
  2164. entity.label == 1 and after_entity.label == 2):
  2165. distance = distance / 100
  2166. value = (-1 / 2 * (distance ** 2)) / 10000
  2167. match_list2.append(Match(entity, after_entity, value))
  2168. match_nums += 1
  2169. if after_entity.entity_type in ['org', 'company']:
  2170. if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
  2171. break
  2172. # 解决在‘地址’中识别出org/company的问题
  2173. # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]:
  2174. if entity.label != 5 and after_index == index + 1 and (
  2175. after_entity.label == entity.label or after_entity.label == 5):
  2176. distance = (tokens_num_dict[
  2177. after_entity.sentence_index] + after_entity.begin_index) - (
  2178. tokens_num_dict[entity.sentence_index] + entity.end_index)
  2179. if distance < 20:
  2180. after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0,
  2181. after_entity.begin_index - 10):after_entity.begin_index]
  2182. after_entity_right = list_sentence[after_entity.sentence_index].tokens[
  2183. after_entity.end_index + 1:after_entity.end_index + 6]
  2184. after_entity_left = "".join(after_entity_left)
  2185. if len(after_entity_left) > 20:
  2186. after_entity_left = after_entity_left[-20:]
  2187. after_entity_right = "".join(after_entity_right)[:10]
  2188. if re.search("地,?址", after_entity_left):
  2189. is_skip = True
  2190. continue
  2191. if re.search("\(|(", after_entity_left) and re.search("\)|)",after_entity_right):
  2192. is_skip = True
  2193. continue
  2194. if entity.label in [0, 1] and after_entity.label in [0, 1] and entity.label == after_entity.label:
  2195. break
  2196. if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[
  2197. index + 1].entity_type == "person":
  2198. break
  2199. if entity.label in [0, 1 ,5] and after_entity.label in [2, 3, 4]:
  2200. break
  2201. if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
  2202. break
  2203. # 搜索没有联系人的电话
  2204. mid_tokens = []
  2205. is_same_sentence = False
  2206. if index == len(split_entitys) - 1:
  2207. for i in range(entity.sentence_index, len(list_sentence)):
  2208. mid_tokens += list_sentence[i].tokens
  2209. mid_tokens = mid_tokens[entity.end_index + 1:]
  2210. mid_sentence = "".join(mid_tokens)
  2211. have_phone = re.findall(re_phone, mid_sentence)
  2212. if have_phone:
  2213. if re.findall(re_phone, mid_sentence.split("。")[0]):
  2214. is_same_sentence = True
  2215. _phone = have_phone[0]
  2216. if _phone in [ent.entity_text for ent in phone_entitys]:
  2217. phone_begin = mid_sentence.find(_phone)
  2218. if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \
  2219. new_split_list[split_index][1]:
  2220. mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
  2221. if re.search(key_phone, mid_sentence):
  2222. # if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位|业主)?联系|(采购|招标)人?联系|请.{0,4}联系",mid_sentence[-10:]):
  2223. if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact,mid_sentence[-15:]):
  2224. # print('not_win_tenderer_contact3')
  2225. pass
  2226. else:
  2227. distance = 1
  2228. if is_same_sentence:
  2229. if phone_begin <= 200:
  2230. if entity.label in [2,3,4] and phone_begin>80:
  2231. break
  2232. value = (-1 / 2 * (distance ** 2)) / 10000
  2233. match_list2.append(Match(entity, (entity, _phone), value))
  2234. match_nums += 1
  2235. else:
  2236. if phone_begin <= 60:
  2237. if entity.label in [2,3,4] and phone_begin>40:
  2238. break
  2239. value = (-1 / 2 * (distance ** 2)) / 10000
  2240. match_list2.append(Match(entity, (entity, _phone), value))
  2241. match_nums += 1
  2242. else:
  2243. next_entity = split_entitys[index + 1]
  2244. if next_entity.entity_type in ["org","company"]:
  2245. _entity_left = list_sentence[next_entity.sentence_index].sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
  2246. _entity_left2 = re.sub(",()\(\)::", "", _entity_left)
  2247. _entity_left2 = _entity_left2[-5:]
  2248. if re.search("(地,?址|地,?点)[::][^,。]*$", _entity_left) or re.search("地址|地点", _entity_left2):
  2249. if index + 2<= len(split_entitys) - 1:
  2250. next_entity = split_entitys[index + 2]
  2251. if len(_entity_left)<=2 and re.search("[、(\(]",_entity_left):
  2252. if index + 2 <= len(split_entitys) - 1:
  2253. next_entity = split_entitys[index + 2]
  2254. if entity.sentence_index == next_entity.sentence_index:
  2255. mid_tokens += list_sentence[entity.sentence_index].tokens[
  2256. entity.end_index + 1:next_entity.begin_index]
  2257. else:
  2258. sentence_index = entity.sentence_index
  2259. while sentence_index <= next_entity.sentence_index:
  2260. mid_tokens += list_sentence[sentence_index].tokens
  2261. sentence_index += 1
  2262. mid_tokens = mid_tokens[entity.end_index + 1:-(len(
  2263. list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1]
  2264. mid_sentence = "".join(mid_tokens)
  2265. have_phone = re.findall(re_phone, mid_sentence)
  2266. if have_phone:
  2267. if re.findall(re_phone, mid_sentence.split("。")[0]):
  2268. is_same_sentence = True
  2269. _phone = have_phone[0]
  2270. if _phone in [ent.entity_text for ent in phone_entitys]:
  2271. phone_begin = mid_sentence.find(_phone)
  2272. mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
  2273. if re.search(key_phone, mid_sentence):
  2274. p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
  2275. if next_entity.entity_type == 'person' and _phone in p_phone:
  2276. pass
  2277. # elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位|业主)?联系|(采购|招标)人?联系|请.{0,4}联系", mid_sentence[-10:]):
  2278. elif entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, mid_sentence[-15:]):
  2279. # print('not_win_tenderer_contact4')
  2280. pass
  2281. else:
  2282. distance = (tokens_num_dict[
  2283. next_entity.sentence_index] + next_entity.begin_index) - (
  2284. tokens_num_dict[entity.sentence_index] + entity.end_index)
  2285. distance = distance / 2
  2286. if is_same_sentence:
  2287. if phone_begin <= 200:
  2288. value = (-1 / 2 * (distance ** 2)) / 10000
  2289. match_list2.append(Match(entity, (entity, _phone), value))
  2290. match_nums += 1
  2291. else:
  2292. if phone_begin <= 60:
  2293. value = (-1 / 2 * (distance ** 2)) / 10000
  2294. match_list2.append(Match(entity, (entity, _phone), value))
  2295. match_nums += 1
  2296. # 实体无匹配时,尝试前向查找匹配
  2297. if not match_nums:
  2298. if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0:
  2299. previous_entity = split_entitys[index - 1]
  2300. if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
  2301. if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
  2302. continue
  2303. # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
  2304. if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact,list_sentence[previous_entity.sentence_index].sentence_text[
  2305. max(0,previous_entity.wordOffset_begin - 15):previous_entity.wordOffset_begin]):
  2306. # print('not_win_tenderer_contact2')
  2307. break
  2308. # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
  2309. if entity.label in [2, 3,4] and not previous_entity.person_phone and not re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",
  2310. list_sentence[previous_entity.sentence_index].sentence_text[max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
  2311. continue
  2312. # 角色为招标/代理人,排除"纪检|监察"相关的联系人
  2313. if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
  2314. max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
  2315. break
  2316. if previous_entity.sentence_index == entity.sentence_index:
  2317. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  2318. tokens_num_dict[
  2319. previous_entity.sentence_index] + previous_entity.end_index)
  2320. if distance < 20:
  2321. # 距离相等时,前向添加处罚值
  2322. # distance += 1
  2323. # 前向 没有 /10000
  2324. value = (-1 / 2 * (distance ** 2))
  2325. match_list2.append(Match(entity, previous_entity, value))
  2326. # print(match_list2)
  2327. # print([(mat.main_role.entity_text,mat.attribute.entity_text if not isinstance(mat.attribute, tuple) else mat.attribute[1]) for mat in match_list2])
  2328. match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person]
  2329. # print(match_list2)
  2330. # print([(mat.main_role.entity_text,mat.attribute.entity_text if not isinstance(mat.attribute, tuple) else mat.attribute[1]) for mat in match_list2])
  2331. # km算法分配求解
  2332. result2 = dispatch(match_list2)
  2333. # print(result2)
  2334. for match in result2:
  2335. entity = match[0]
  2336. # print(entity.entity_text)
  2337. # print(entity.label)
  2338. # print(match.attribute)
  2339. entity_index = list_entity.index(entity)
  2340. is_update = False
  2341. if isinstance(match[1], tuple):
  2342. person_ = ''
  2343. phone_ = match[1][1].split("/") # 分割多个号码
  2344. # print(person_,phone_)
  2345. else:
  2346. person_ = match[1].entity_text
  2347. phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
  2348. for k in PackDict.keys():
  2349. for i in range(len(PackDict[k]["roleList"])):
  2350. if PackDict[k]["roleList"][i].role_name == "tenderee":
  2351. # if not PackDict[k]["roleList"][i].linklist:
  2352. if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0:
  2353. if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact:
  2354. if not phone_:
  2355. PackDict[k]["roleList"][i].linklist.append((person_, ""))
  2356. for p in phone_:
  2357. # if not person_ and len()
  2358. PackDict[k]["roleList"][i].linklist.append((person_, p))
  2359. is_update = True
  2360. elif PackDict[k]["roleList"][i].role_name == "agency":
  2361. # if not PackDict[k]["roleList"][i].linklist:
  2362. if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact:
  2363. if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0:
  2364. if not phone_:
  2365. PackDict[k]["roleList"][i].linklist.append((person_, ""))
  2366. for p in phone_:
  2367. PackDict[k]["roleList"][i].linklist.append((person_, p))
  2368. is_update = True
  2369. else:
  2370. if PackDict[k]["roleList"][i].entity_text == entity.entity_text:
  2371. # if not PackDict[k]["roleList"][i].linklist:
  2372. if len([item for item in PackDict[k]["roleList"][i].linklist if item[1]])==0: # 有联系人但无联系方式(号码)
  2373. if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \
  2374. person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0:
  2375. if not phone_:
  2376. PackDict[k]["roleList"][i].linklist.append((person_, ""))
  2377. for p in phone_:
  2378. PackDict[k]["roleList"][i].linklist.append((person_, p))
  2379. is_update = True
  2380. if not person_:
  2381. is_update = False
  2382. if is_update:
  2383. # 更新 list_entity
  2384. if not list_entity[entity_index].pointer_person:
  2385. list_entity[entity_index].pointer_person = []
  2386. list_entity[entity_index].pointer_person.append(match[1])
  2387. # print('tenderee_contact',tenderee_contact)
  2388. # print('tenderee_phone',tenderee_phone)
  2389. # print('agency_contact',agency_contact)
  2390. # print('agency_phone',agency_phone)
  2391. # print('PackDict')
  2392. # for k in PackDict.keys():
  2393. # for i in range(len(PackDict[k]["roleList"])):
  2394. # print(PackDict[k]["roleList"][i].role_name)
  2395. # print(PackDict[k]["roleList"][i].entity_text)
  2396. # print(PackDict[k]["roleList"][i].linklist)
  2397. linked_person = []
  2398. linked_persons_with = []
  2399. for company_entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]:
  2400. if company_entity.pointer_person:
  2401. for _person in company_entity.pointer_person:
  2402. linked_person.append(_person)
  2403. linked_persons_with.append(company_entity)
  2404. # 一个公司对应多个联系人的补充
  2405. person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
  2406. person_entitys = person_entitys[::-1]
  2407. for index in range(len(person_entitys)):
  2408. entity = person_entitys[index]
  2409. prepare_link = []
  2410. if entity not in linked_person:
  2411. prepare_link.append(entity)
  2412. last_person = entity
  2413. for after_index in range(index + 1, min(len(person_entitys), index + 5)):
  2414. after_entity = person_entitys[after_index]
  2415. if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5:
  2416. if after_entity in linked_person:
  2417. _index = linked_person.index(after_entity)
  2418. with_company = linked_persons_with[_index]
  2419. for i in range(len(PackDict["Project"]["roleList"])):
  2420. if PackDict["Project"]["roleList"][i].role_name == "tenderee":
  2421. if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0:
  2422. for item in prepare_link:
  2423. person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
  2424. for _p in person_phone:
  2425. PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
  2426. with_company.pointer_person.append(item)
  2427. linked_person.append(item)
  2428. elif PackDict["Project"]["roleList"][i].role_name == "agency":
  2429. if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1:
  2430. for item in prepare_link:
  2431. person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
  2432. for _p in person_phone:
  2433. PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
  2434. with_company.pointer_person.append(item)
  2435. linked_person.append(item)
  2436. else:
  2437. if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text:
  2438. for item in prepare_link:
  2439. person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
  2440. for _p in person_phone:
  2441. PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
  2442. with_company.pointer_person.append(item)
  2443. linked_person.append(item)
  2444. break
  2445. else:
  2446. prepare_link.append(after_entity)
  2447. last_person = after_entity
  2448. continue
  2449. # 统一同类角色的属性
  2450. for k in PackDict.keys():
  2451. for i in range(len(PackDict[k]["roleList"])):
  2452. for _entity in list_entity:
  2453. if _entity.entity_type in ['org','company']:
  2454. is_same = False
  2455. is_similar = False
  2456. # entity_text相同
  2457. if _entity.entity_text==PackDict[k]["roleList"][i].entity_text:
  2458. is_same = True
  2459. # entity.label为【0,1】
  2460. if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict[k]["roleList"][i].role_name:
  2461. is_similar = True
  2462. if is_same:
  2463. linked_entitys = _entity.linked_entitys
  2464. if linked_entitys:
  2465. for linked_entity in linked_entitys:
  2466. pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else []
  2467. for _pointer_person in pointer_person:
  2468. _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
  2469. for _p in _phone:
  2470. if (_pointer_person.entity_text,_p) not in PackDict[k]["roleList"][i].linklist:
  2471. PackDict[k]["roleList"][i].linklist.append((_pointer_person.entity_text,_p))
  2472. elif is_similar:
  2473. pointer_person = _entity.pointer_person if _entity.pointer_person else []
  2474. for _pointer_person in pointer_person:
  2475. _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
  2476. for _p in _phone:
  2477. if (_pointer_person.entity_text, _p) not in PackDict[k]["roleList"][i].linklist:
  2478. PackDict[k]["roleList"][i].linklist.append(
  2479. (_pointer_person.entity_text, _p))
  2480. # "roleList"中联系人电话去重
  2481. tenderee_agency_phone = []
  2482. tenderee_agency_contact = []
  2483. for k in PackDict.keys():
  2484. for i in range(len(PackDict[k]["roleList"])):
  2485. if PackDict[k]["roleList"][i].role_name in ['agency','tenderee']:
  2486. tenderee_agency_phone.extend([person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]])
  2487. tenderee_agency_contact.extend([person_phone[0]+'-'+person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist])
  2488. # 带有联系人的电话
  2489. with_person = [person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[0]]
  2490. # 带有电话的联系人
  2491. with_phone = [person_phone[0] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]]
  2492. remove_list = []
  2493. for item in PackDict[k]["roleList"][i].linklist:
  2494. if not item[0]:
  2495. if item[1] in with_person:
  2496. # 删除重复的无联系人电话
  2497. remove_list.append(item)
  2498. elif not item[1]:
  2499. if item[0] in with_phone:
  2500. remove_list.append(item)
  2501. for _item in remove_list:
  2502. PackDict[k]["roleList"][i].linklist.remove(_item)
  2503. # 中标候选人联系方式异常排除
  2504. for k in PackDict.keys():
  2505. for i in range(len(PackDict[k]["roleList"])):
  2506. if PackDict[k]["roleList"][i].role_name in ['win_tenderer', 'second_tenderer','third_tenderer']:
  2507. if tenderee_agency_phone or tenderee_agency_contact:
  2508. remove_list = []
  2509. for item in PackDict[k]["roleList"][i].linklist:
  2510. if item[1] and item[1] in tenderee_agency_phone:
  2511. remove_list.append(item)
  2512. elif item[0]+'-'+item[1] in tenderee_agency_contact:
  2513. remove_list.append(item)
  2514. for _item in remove_list:
  2515. PackDict[k]["roleList"][i].linklist.remove(_item)
  2516. elif not tenderee_agency_phone:
  2517. # 公告中无招标代理联系方式时,可排除中标联系方式
  2518. remove_list = []
  2519. for _item in PackDict[k]["roleList"][i].linklist:
  2520. # 排除非正则规则识别的联系方式
  2521. if _item[1] not in rule_winter_phone:
  2522. remove_list.append(_item)
  2523. # print('remove_list',remove_list)
  2524. for _item in remove_list:
  2525. PackDict[k]["roleList"][i].linklist.remove(_item)
  2526. # PackDict更新company/org地址
  2527. last_role_prob = {}
  2528. for ent in pre_entity:
  2529. if ent.entity_type in ['company','org']:
  2530. if ent.pointer_address:
  2531. for k in PackDict.keys():
  2532. for i in range(len(PackDict[k]["roleList"])):
  2533. if PackDict[k]["roleList"][i].entity_text == ent.entity_text:
  2534. if not PackDict[k]["roleList"][i].address:
  2535. PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
  2536. last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
  2537. else:
  2538. if PackDict[k]["roleList"][i].role_name in ['tenderee','agency']:
  2539. # 角色为招标/代理人时,取其实体概率高的链接地址作为角色address
  2540. if ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]] > last_role_prob[PackDict[k]["roleList"][i].role_name]:
  2541. PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
  2542. last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
  2543. else:
  2544. if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
  2545. PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
  2546. # 联系人——电子邮箱链接
  2547. temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
  2548. temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index))
  2549. new_temporary_list3 = []
  2550. for _split in new_split_list:
  2551. temp_list = []
  2552. for _entity in temporary_list3:
  2553. if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
  2554. _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
  2555. temp_list.append(_entity)
  2556. elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
  2557. break
  2558. new_temporary_list3.append(temp_list)
  2559. # print(new_temporary_list3)
  2560. match_list3 = []
  2561. for split_index in range(len(new_temporary_list3)):
  2562. split_entitys = new_temporary_list3[split_index]
  2563. for index in range(len(split_entitys)):
  2564. entity = split_entitys[index]
  2565. if entity.entity_type == 'person':
  2566. match_nums = 0
  2567. for after_index in range(index + 1, min(len(split_entitys), index + 4)):
  2568. after_entity = split_entitys[after_index]
  2569. if match_nums > 2:
  2570. break
  2571. if after_entity.entity_type == 'email':
  2572. distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
  2573. tokens_num_dict[entity.sentence_index] + entity.end_index)
  2574. sentence_distance = after_entity.sentence_index - entity.sentence_index
  2575. if sentence_distance == 0:
  2576. if distance < 100:
  2577. if (entity.label == 0 and after_entity.label == 1) or (
  2578. entity.label == 1 and after_entity.label == 2):
  2579. distance = distance / 100
  2580. value = (-1 / 2 * (distance ** 2)) / 10000
  2581. match_list3.append(Match(entity, after_entity, value))
  2582. match_nums += 1
  2583. else:
  2584. if distance < 60:
  2585. if (entity.label == 0 and after_entity.label == 1) or (
  2586. entity.label == 1 and after_entity.label == 2):
  2587. distance = distance / 100
  2588. value = (-1 / 2 * (distance ** 2)) / 10000
  2589. match_list3.append(Match(entity, after_entity, value))
  2590. match_nums += 1
  2591. # 前向查找匹配
  2592. # if not match_nums:
  2593. if index != 0:
  2594. previous_entity = split_entitys[index - 1]
  2595. if previous_entity.entity_type == 'email':
  2596. if previous_entity.sentence_index == entity.sentence_index:
  2597. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  2598. tokens_num_dict[
  2599. previous_entity.sentence_index] + previous_entity.end_index)
  2600. if distance < 30:
  2601. # 距离相等时,前向添加处罚值
  2602. # distance += 1
  2603. # 前向 没有 /10000
  2604. value = (-1 / 2 * (distance ** 2))
  2605. match_list3.append(Match(entity, previous_entity, value))
  2606. # print(match_list3)
  2607. # km算法分配求解
  2608. result3 = dispatch(match_list3)
  2609. for match in result3:
  2610. match_person = match[0]
  2611. match_email = match[1]
  2612. match_person.pointer_email = match_email
  2613. # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
  2614. # temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
  2615. # other_person = [] # 阈值以上的联系人列表
  2616. # link_person = [] # 有电话没联系上角色的person列表
  2617. # other_ent = []
  2618. # link_ent = []
  2619. # found_person = False
  2620. # ent_list = []
  2621. # for entity in list_entity:
  2622. # if entity.entity_type in ['org','company','person']:
  2623. # ent_list.append(entity)
  2624. # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
  2625. # #for list_index in range(len(ent_list)):
  2626. # #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
  2627. # #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
  2628. # #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
  2629. # # 2020/11/25增加确定角色联系人判断
  2630. # sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
  2631. # # 招标/代理在同一句中交叉情况的处理
  2632. # for index in range(len(ent_list)):
  2633. # entity = ent_list[index]
  2634. # if entity.entity_text in roleSet and entity.label in [0, 1] and index+3<len(ent_list):
  2635. # if entity.sentence_index==ent_list[index+1].sentence_index==ent_list[index+2].sentence_index==ent_list[index+3].sentence_index:
  2636. # if ent_list[index+1].begin_index - entity.end_index < 30:
  2637. # if ent_list[index+1].entity_text in roleSet and ent_list[index+1].label in [0, 1] and entity.label!=ent_list[index+1].label:
  2638. # if ent_list[index+2].entity_type=="person" and ent_list[index+3].entity_type=="person" and \
  2639. # ent_list[index+2].label==3 and ent_list[index+3].label==3:
  2640. # ent_list[index + 1], ent_list[index + 2] = ent_list[index + 2], ent_list[index + 1]
  2641. #
  2642. #
  2643. # for index in range(len(ent_list)):
  2644. # entity = ent_list[index]
  2645. # if entity.entity_type=="person":
  2646. # if str(entity.label) == "0": # 2020/11/25 非联系人直接跳过
  2647. # continue
  2648. # if entity.values[entity.label]>on_value_person:
  2649. # if str(entity.label)=="1":
  2650. # for i in range(len(PackDict["Project"]["roleList"])):
  2651. # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  2652. # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  2653. # link_person.append(entity.entity_text)
  2654. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  2655. # # add pointer_person
  2656. # for _entity in list_entity:
  2657. # if dict_role_id.get(str(_entity.label))=="tenderee":
  2658. # for i in range(len(PackDict["Project"]["roleList"])):
  2659. # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  2660. # _entity.pointer_person = entity
  2661. # elif str(entity.label)=="2":
  2662. # for i in range(len(PackDict["Project"]["roleList"])):
  2663. # if PackDict["Project"]["roleList"][i].role_name=="agency":
  2664. # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  2665. # link_person.append(entity.entity_text)
  2666. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  2667. # # add pointer_person
  2668. # for _entity in list_entity:
  2669. # if dict_role_id.get(str(_entity.label))=="agency":
  2670. # for i in range(len(PackDict["Project"]["roleList"])):
  2671. # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  2672. # _entity.pointer_person = entity
  2673. # elif str(entity.label)=="3":
  2674. # if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人
  2675. # continue
  2676. # #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
  2677. # other_person.append(entity.entity_text)
  2678. # temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
  2679. #
  2680. # #if entity.entity_text in roleSet:
  2681. # if entity.entity_text in roleSet:
  2682. # if entity.label in [0,1]:
  2683. # other_ent.append(entity.entity_text)
  2684. # temp_ent_list.append((entity.entity_text, entity.label,entity))
  2685. # for behind_index in range(index+1, len(ent_list)):
  2686. # entity_after = ent_list[behind_index]
  2687. # if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人
  2688. # break
  2689. # if entity_after.values is not None:
  2690. # if entity_after.entity_type=="person":
  2691. # if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
  2692. # break
  2693. # if entity_after.values[entity_after.label]>on_value_person:
  2694. # if str(entity_after.label)=="1":
  2695. # for i in range(len(PackDict["Project"]["roleList"])):
  2696. # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  2697. # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  2698. # link_person.append(entity_after.entity_text)
  2699. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  2700. # elif str(entity_after.label)=="2":
  2701. # for i in range(len(PackDict["Project"]["roleList"])):
  2702. # if PackDict["Project"]["roleList"][i].role_name=="agency":
  2703. # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  2704. # link_person.append(entity_after.entity_text)
  2705. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  2706. # elif str(entity_after.label)=="3":
  2707. # if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
  2708. # break
  2709. # elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
  2710. # break
  2711. # for pack in PackDict.keys():
  2712. # for i in range(len(PackDict[pack]["roleList"])):
  2713. # if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
  2714. # #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
  2715. # #break
  2716. # PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  2717. # link_person.append(entity_after.entity_text)
  2718. # #add pointer_person
  2719. # entity.pointer_person = entity_after
  2720. #
  2721. # not_link_person = [person for person in other_person if person not in link_person]
  2722. # not_link_ent = [ent for ent in other_ent if ent not in link_ent]
  2723. # if len(not_link_person) > 0 and len(not_link_ent) > 0 :
  2724. # item = temp_ent_list
  2725. # for i in range(len(item)):
  2726. # if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
  2727. # if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
  2728. # item[i+1], item[i+2] = item[i+2], item[i+1]
  2729. # for i in range(len(item)-1, -1, -1):
  2730. # if item[i][0] in not_link_ent:
  2731. # for pack in PackDict.keys():
  2732. # for role in PackDict[pack]["roleList"]:
  2733. # if role.entity_text == item[i][0] and len(role.linklist) < 1:
  2734. # for j in range(i+1, len(item)):
  2735. # if item[j][0] in not_link_person:
  2736. # role.linklist.append(item[j][:2])
  2737. # #add pointer_person
  2738. # item[i][2].pointer_person = item[j][2]
  2739. # break
  2740. # else:
  2741. # break
  2742. # # 电话没有联系人的处理
  2743. # role_with_no_phone = []
  2744. # for i in range(len(PackDict["Project"]["roleList"])):
  2745. # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
  2746. # if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人
  2747. # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
  2748. # else:
  2749. # phone_nums = 0
  2750. # for link in PackDict["Project"]["roleList"][i].linklist:
  2751. # if link[1]:
  2752. # phone_nums += 1
  2753. # break
  2754. # if not phone_nums:
  2755. # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
  2756. # if role_with_no_phone:
  2757. # phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"]
  2758. # # phone_with_person = [phone for phone in phone_with_person if phone]
  2759. #
  2760. # dict_index_sentence = {}
  2761. # for _sentence in list_sentence:
  2762. # dict_index_sentence[_sentence.sentence_index] = _sentence
  2763. # new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
  2764. # for index in range(len(new_entity_list)):
  2765. # entity = new_entity_list[index]
  2766. # if entity.entity_text in role_with_no_phone:
  2767. # e_sentence = dict_index_sentence[entity.sentence_index]
  2768. # entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40]
  2769. # entity_right = "".join(entity_right)
  2770. # if index+1<len(new_entity_list) and entity_right.find(new_entity_list[index+1].entity_text)>-1:
  2771. # entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)]
  2772. # have_phone = re.findall(phone,entity_right)
  2773. # if have_phone:
  2774. # _phone = have_phone[0]
  2775. # phone_begin = entity_right.find(_phone)
  2776. # if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]):
  2777. # # entity.person_phone = _phone
  2778. # for i in range(len(PackDict["Project"]["roleList"])):
  2779. # if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text:
  2780. # PackDict["Project"]["roleList"][i].linklist.append(('', _phone))
  2781. #寻找多标段招标金额
  2782. p_entity = len(list_entity)-1
  2783. set_tenderer_money = set()
  2784. list_tenderer_money = [] #2021/7/16 新增列表,倒序保存所有中标金额
  2785. unit_list = [] #2021/8/17 新增,保存金额单位
  2786. #遍历所有实体
  2787. max_prob = 0 # 保存招标金额最大概率
  2788. while(p_entity>=0):
  2789. entity = list_entity[p_entity]
  2790. if entity.entity_type=="money":
  2791. # 2021/12/03 添加成本警戒线、保证金
  2792. if entity.notes in ['保证金', '成本警戒线']:
  2793. packagePointer, _flag = getPackage(PackageList, entity.sentence_index, entity.begin_index,
  2794. "money-" + str(entity.label), MAX_DIS=2, DIRECT="L")
  2795. if packagePointer is None:
  2796. packageName = "Project"
  2797. else:
  2798. packageName = packagePointer.entity_text
  2799. if packageName == "Project":
  2800. # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
  2801. # PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
  2802. if entity.notes=="保证金" and "bond" not in PackDict["Project"]:
  2803. PackDict["Project"]["bond"] = str(Decimal(entity.entity_text))
  2804. elif entity.notes=="成本警戒线" and "cost_warning" not in PackDict["Project"]:
  2805. PackDict["Project"]["cost_warning"] = str(Decimal(entity.entity_text))
  2806. else:
  2807. if entity.notes == "保证金" and "bond" not in PackDict[packageName]:
  2808. PackDict[packageName]["bond"] = str(Decimal(entity.entity_text))
  2809. elif entity.notes == "成本警戒线" and "cost_warning" not in PackDict[packageName]:
  2810. PackDict[packageName]["cost_warning"] = str(Decimal(entity.entity_text))
  2811. elif entity.values[entity.label]>=on_value:
  2812. if str(entity.label)=="1" and entity.notes != '单价':
  2813. set_tenderer_money.add(float(entity.entity_text))
  2814. list_tenderer_money.append(float(entity.entity_text)) # 2021/7/16 新增列表,倒序保存所有中标金额
  2815. unit_list.append(entity.money_unit)
  2816. # if str(entity.label)=="0":
  2817. if str(entity.label)=="0" and (entity.notes!='总投资' or float(entity.entity_text)<100000000):
  2818. '''
  2819. if p_entity>0:
  2820. p_before = list_entity[p_entity-1]
  2821. if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
  2822. p_entity -= 1
  2823. continue
  2824. '''
  2825. packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
  2826. if packagePointer is None:
  2827. packageName = "Project"
  2828. else:
  2829. packageName = packagePointer.entity_text
  2830. if packageName=="Project":
  2831. # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
  2832. # PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
  2833. # if entity.values[entity.label]>on_value:
  2834. if entity.values[entity.label]>max_prob-0.005: # 选择最大概率招标金额 2024/05/23 相差0.005尽量选前面的
  2835. if entity.notes == '单价':
  2836. PackDict["Project"]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
  2837. else:
  2838. PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
  2839. PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
  2840. max_prob = entity.values[entity.label]
  2841. else:
  2842. if entity.notes == '单价':
  2843. PackDict[packageName]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
  2844. else:
  2845. PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
  2846. PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
  2847. #add pointer_tendereeMoney
  2848. packagePointer.pointer_tendereeMoney = entity
  2849. p_entity -= 1
  2850. '''标段链接包名包号'''
  2851. pk_name_l = []
  2852. pk_code_l = []
  2853. count_dic = {
  2854. 'package': set(),
  2855. 'name': set(),
  2856. 'code': set()
  2857. }
  2858. def get_sort_dist(l, max_sent_dist=2):
  2859. '''
  2860. 计算标段与其他要素距离,并按距离排序返回字典
  2861. :param l: [(entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end)]
  2862. :param max_sent_dist: 最大句子距离
  2863. :return:
  2864. '''
  2865. l.sort(key=lambda x: [x[2],x[3],x[4]]) # 20241204 多个字段排序 修复 561998414 第一标段西铭矿清水泵采购 标段和包名开始位置一样的情况
  2866. link_dic = {}
  2867. i = 1
  2868. while i < len(l):
  2869. ty1, ent1, s1, b1, e1, in_att1 = l[i - 1]
  2870. ty2, ent2, s2, b2, e2, in_att2 = l[i]
  2871. if ty1 != ty2 and in_att1 == in_att2 and s2 - s1 <= max_sent_dist:
  2872. if ty1 == 'package':
  2873. if ent1 not in link_dic:
  2874. link_dic[ent1] = []
  2875. if s1 == s2:
  2876. dist = abs(b2 - e1) if b2 > e1 else 0
  2877. else:
  2878. dist = len(list_sentence[s1].sentence_text) - e1
  2879. for id in range(s1+1, s2):
  2880. dist += len(list_sentence[id].sentence_text)
  2881. dist += b2
  2882. if in_att1:
  2883. dist += 100 # 附件的距离加100
  2884. link_dic[ent1].append((s2 - s1, dist, ent2))
  2885. elif ty2 == 'package':
  2886. if ent2 not in link_dic:
  2887. link_dic[ent2] = []
  2888. if s1 == s2:
  2889. dist = abs(b2 - e1) if b2 > e1 else 0
  2890. else:
  2891. dist = len(list_sentence[s1].sentence_text) - e1
  2892. for id in range(s1+1, s2):
  2893. dist += len(list_sentence[id].sentence_text)
  2894. dist += b2
  2895. if in_att1:
  2896. dist += 100 # 附件的距离加100
  2897. if s1!=s2 or e1!=e2:
  2898. dist += 30 # 包号在实体后面距离再加30
  2899. link_dic[ent2].append((s2 - s1, dist, ent1))
  2900. i += 1
  2901. return link_dic
  2902. for entity in list_entity:
  2903. if entity.entity_type == 'package':
  2904. pk_name_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment))
  2905. pk_code_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment))
  2906. count_dic['package'].add(entity.entity_text)
  2907. elif entity.entity_type == 'name':
  2908. pk_name_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment))
  2909. count_dic['name'].add(entity.entity_text)
  2910. elif entity.entity_type == 'code':
  2911. pk_code_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment))
  2912. count_dic['code'].add(entity.entity_text)
  2913. if len(count_dic['package']) > 0:
  2914. if len(count_dic['name'])>0:
  2915. link_dic = get_sort_dist(pk_name_l)
  2916. for k, v in link_dic.items():
  2917. v.sort(key=lambda x: [x[0], x[1]])
  2918. if v[0][0] < 2 and v[0][1] < 200: # 标段号与包名句子数小于2,字距离小于200的才添加
  2919. PackDict[k]["name"] = v[0][2]
  2920. if len(count_dic['code'])>0:
  2921. link_dic = get_sort_dist(pk_code_l)
  2922. for k, v in link_dic.items():
  2923. v.sort(key=lambda x: [x[0], x[1]])
  2924. if v[0][0] < 2 and v[0][1] < 200:
  2925. PackDict[k]["code"] = v[0][2]
  2926. #删除一个机构有多个角色的数据
  2927. #删除重复人、概率不回传
  2928. final_roleList = []
  2929. list_pop = []
  2930. set_tenderer_role = set()
  2931. dict_pack_tenderer_money = dict()
  2932. for pack in PackDict.keys():
  2933. #删除无效包
  2934. if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
  2935. list_pop.append(pack)
  2936. for i in range(len(PackDict[pack]["roleList"])):
  2937. if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
  2938. if PackDict[pack]["roleList"][i].money==0:
  2939. set_tenderer_role.add(PackDict[pack]["roleList"][i])
  2940. dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
  2941. #找到包的中投标金额
  2942. for _index in range(len(PackageList)):
  2943. if "hit" in PackageList[_index]:
  2944. for _hit in list(PackageList[_index]["hit"]):
  2945. if len(_hit.split("-"))==3:
  2946. _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
  2947. # 补充金额前新增负号‘-’导致错误的规则
  2948. elif len(_hit.split("-"))==4:
  2949. _money = float(_hit.split("-")[2]) if _hit.split("-")[0] == "money" else None
  2950. else:
  2951. _money = None
  2952. if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
  2953. dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
  2954. #只找到一个中标人和中标金额
  2955. if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
  2956. list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
  2957. list(set_tenderer_role)[0].money_unit = unit_list[0]
  2958. # print('一个中标人一个金额:', list(set_tenderer_money)[0])
  2959. #找到一个中标人和多个招标金额
  2960. if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
  2961. _maxMoney = list(set_tenderer_money)[0]
  2962. _sumMoney = 0
  2963. for _m in list(set_tenderer_money):
  2964. _sumMoney += _m
  2965. if _m>_maxMoney:
  2966. _maxMoney = _m
  2967. if _sumMoney/_maxMoney==2:
  2968. list(set_tenderer_role)[0].money = _maxMoney
  2969. # print('一人多金额分项合计 取最大金额:', _maxMoney)
  2970. else:
  2971. # list(set_tenderer_role)[0].money = _maxMoney
  2972. if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
  2973. list(set_tenderer_role)[0].money = min(list_tenderer_money)
  2974. list(set_tenderer_role)[0].money_unit = unit_list[list_tenderer_money.index(min(list_tenderer_money))]
  2975. # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money))
  2976. else:
  2977. list(set_tenderer_role)[0].money = list_tenderer_money[-1] # 2021/7/16 修改 不是单价合计方式取第一个中标金额
  2978. list(set_tenderer_role)[0].money_unit = unit_list[-1] # 金额单位
  2979. # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1])
  2980. #每个包都只找到一个金额
  2981. _flag_pack_money = True
  2982. for k,v in dict_pack_tenderer_money.items():
  2983. if len(v[1])!=1:
  2984. _flag_pack_money = False
  2985. if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
  2986. for k,v in dict_pack_tenderer_money.items():
  2987. if float(v[0].unit_price) < float(list(v[1])[0]): # 20241128 金额大于单价时才作链接金额
  2988. v[0].money = list(v[1])[0]
  2989. # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
  2990. for pack in PackDict.keys():
  2991. for i in range(len(PackDict[pack]["roleList"])):
  2992. if float(PackDict[pack]["tendereeMoney"]) > 0:
  2993. # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money))
  2994. if float(PackDict[pack]["roleList"][i].money) >10000000 and \
  2995. float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
  2996. PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
  2997. # print('招标金额校正中标金额')
  2998. # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别)
  2999. for pack in PackDict.keys():
  3000. for i in range(len(PackDict[pack]["roleList"])):
  3001. if float(PackDict[pack]["tendereeMoney"]) > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
  3002. if float(PackDict[pack]["roleList"][i].money) < 1000 and \
  3003. float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \
  3004. float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000:
  3005. PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) * 10000
  3006. # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
  3007. for pack in PackDict.keys():
  3008. tmp_moneys = []
  3009. for i in range(len(PackDict[pack]["roleList"])):
  3010. if float(PackDict[pack]["roleList"][i].money) >100000:
  3011. tmp_moneys.append(float(PackDict[pack]["roleList"][i].money))
  3012. if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000:
  3013. for i in range(len(PackDict[pack]["roleList"])):
  3014. if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
  3015. PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
  3016. # print('通过其他中标人投标金额校正中标金额')
  3017. for item in list_pop:
  3018. PackDict.pop(item)
  3019. # 公告中只有"招标人"且无"联系人"链接时
  3020. if len(PackDict)==1:
  3021. k = list(PackDict.keys())[0]
  3022. tenderee_agency_role = [role for role in PackDict[k]["roleList"] if role.role_name in ['tenderee','agency','win_tenderer']]
  3023. if len(tenderee_agency_role)==1:
  3024. exist_person = []
  3025. exist_phone = []
  3026. for role in PackDict[k]["roleList"]:
  3027. for group in role.linklist:
  3028. if group[0]:
  3029. exist_person.append(group[0])
  3030. if group[1]:
  3031. exist_phone.append(group[1])
  3032. if tenderee_agency_role[0].role_name == "tenderee":
  3033. if not tenderee_agency_role[0].linklist:
  3034. get_contacts = False
  3035. if not get_contacts:
  3036. # 根据大纲Outline类召回联系人
  3037. for outline in list_outline:
  3038. if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary) and \
  3039. not re.search("代理|乙方|竞得|受让|买受|签约|供货|供应|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选",outline.outline_summary):
  3040. for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]:
  3041. if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[
  3042. t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
  3043. if t_person.person_phone:
  3044. _phone = [p.entity_text for p in t_person.person_phone]
  3045. for _p in _phone:
  3046. if t_person.entity_text not in exist_person and _p not in ",".join(exist_phone):
  3047. tenderee_agency_role[0].linklist.append((t_person.entity_text, _p))
  3048. get_contacts = True
  3049. break
  3050. elif words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= \
  3051. words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
  3052. break
  3053. if not get_contacts:
  3054. sentence_phone = phone.findall(outline.outline_text)
  3055. if sentence_phone:
  3056. if sentence_phone[0] not in ",".join(exist_phone):
  3057. tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
  3058. get_contacts = True
  3059. break
  3060. if not get_contacts:
  3061. # 直接取文中倒数第一个联系人
  3062. for _entity in temporary_list2[::-1]:
  3063. if _entity.entity_type=='person' and _entity.label==3:
  3064. if _entity.person_phone:
  3065. _phone = [p.entity_text for p in _entity.person_phone]
  3066. for _p in _phone:
  3067. if _entity.entity_text not in exist_person and _p not in ",".join(exist_phone):
  3068. tenderee_agency_role[0].linklist.append((_entity.entity_text, _p))
  3069. get_contacts = True
  3070. break
  3071. if not get_contacts:
  3072. # 如果文中只有一个“phone”实体,则直接取为联系人电话
  3073. if len(phone_entitys) == 1:
  3074. if phone_entitys[0].entity_text not in ",".join(exist_phone):
  3075. tenderee_agency_role[0].linklist.append(("", phone_entitys[0].entity_text))
  3076. get_contacts = True
  3077. if not get_contacts:
  3078. # 通过大纲Outline类直接取电话
  3079. if len(new_split_list) > 1:
  3080. for _start, _end in new_split_list:
  3081. temp_sentence = _content[_start:_end]
  3082. sentence_outline = temp_sentence.split(",::")[0]
  3083. if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline) and \
  3084. not re.search("代理|乙方|竞得|受让|买受|签约|供货|供应|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选",sentence_outline):
  3085. sentence_phone = phone.findall(temp_sentence)
  3086. if sentence_phone:
  3087. if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in ",".join(exist_phone):
  3088. tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
  3089. get_contacts = True
  3090. break
  3091. if not get_contacts:
  3092. # 通过正则提取句子段落进行提取电话
  3093. contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?"
  3094. tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}"
  3095. contact_pattern_list = [tenderee_pattern + contacts_person,
  3096. "(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person,
  3097. "(?:项目|采购)[^。,]{0,4}" + contacts_person,
  3098. "(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}" + contacts_person, ]
  3099. for _pattern in contact_pattern_list:
  3100. get_tenderee_contacts = False
  3101. for regular_match in re.finditer(_pattern, _content):
  3102. match_text = _content[regular_match.end():regular_match.end() + 50]
  3103. match_text = match_text.split("。")[0]
  3104. sentence_phone = phone.findall(match_text)
  3105. if sentence_phone:
  3106. if sentence_phone[0] not in ",".join(exist_phone):
  3107. tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
  3108. get_tenderee_contacts = True
  3109. break
  3110. if get_tenderee_contacts:
  3111. break
  3112. # 如果同一个电话连到了不同的单位就直接去掉(2024-09-03 新增)
  3113. get_phone_dict = dict()
  3114. for k in PackDict.keys():
  3115. for i in range(len(PackDict[k]["roleList"])):
  3116. for item in PackDict[k]["roleList"][i].linklist:
  3117. if item[1]:
  3118. if item[1] not in get_phone_dict:
  3119. get_phone_dict[item[1]] = set()
  3120. get_phone_dict[item[1]].add(PackDict[k]["roleList"][i].entity_text)
  3121. # print(get_phone_dict)
  3122. remove_phone = []
  3123. for phone,role_list in get_phone_dict.items():
  3124. if len(role_list)>1:
  3125. remove_phone.append(phone)
  3126. for k in PackDict.keys():
  3127. for i in range(len(PackDict[k]["roleList"])):
  3128. remove_list = []
  3129. for item in PackDict[k]["roleList"][i].linklist:
  3130. if item[1] and item[1] in remove_phone:
  3131. remove_list.append(item)
  3132. for _item in remove_list:
  3133. PackDict[k]["roleList"][i].linklist.remove(_item)
  3134. for pack in PackDict.keys():
  3135. for i in range(len(PackDict[pack]["roleList"])):
  3136. PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
  3137. return PackDict
  3138. def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set, main_body_pack):
  3139. '''
  3140. @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
  3141. '''
  3142. packDict = dict()
  3143. packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''}
  3144. for item in list(PackageSet):
  3145. packDict[item] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''}
  3146. packDict[item]['in_attachment'] = False if item in main_body_pack else True
  3147. for item in RoleList:
  3148. if packDict[item.packageName]["code"] =="":
  3149. packDict[item.packageName]["code"] = item.packageCode
  3150. # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
  3151. # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位)
  3152. packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,item.role_prob,0,0.0,[],set(item.multi_winner)-win_tenderer_set-tenderee_or_agency_set)) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,多中标人)
  3153. return packDict
  3154. def getPackageRoleMoney(list_sentence,list_entity,list_outline,winter_scope):
  3155. '''
  3156. @param:
  3157. list_sentence:文章的句子list
  3158. list_entity:文章的实体list
  3159. @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话
  3160. '''
  3161. # print("=1")
  3162. theRole = getRoleList(list_sentence,list_entity)
  3163. if not theRole:
  3164. return []
  3165. # RoleList,RoleSet,PackageList,PackageSet = theRole
  3166. RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set,main_body_pack = theRole
  3167. '''
  3168. for item in PackageList:
  3169. # print(item)
  3170. '''
  3171. # PackDict = initPackageAttr(RoleList, PackageSet)
  3172. PackDict = initPackageAttr(RoleList, PackageSet, win_tenderer_set,tenderee_or_agency_set,main_body_pack)
  3173. PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline, winter_scope)
  3174. return PackDict
  3175. def turnBidWay(bidway):
  3176. if bidway in ("邀请招标","采购方式:邀请"):
  3177. return "邀请招标"
  3178. elif bidway in ("询价","询单","询比","采购方式:询价"):
  3179. return "询价"
  3180. elif bidway in ("竞谈","竞争性谈判","公开竞谈"):
  3181. return "竞争性谈判"
  3182. elif bidway in ("竞争性磋商","磋商"):
  3183. return "竞争性磋商"
  3184. elif bidway in ("竞价","竞标","电子竞价","以电子竞价","电子书面竞投"):
  3185. return "竞价"
  3186. elif bidway in ("公开招标","网上电子投标","网上招标","采购方式:公开","招标为其他"):
  3187. return "公开招标"
  3188. elif bidway in ("单一来源"):
  3189. return "单一来源"
  3190. elif bidway in ("比选"):
  3191. return "比选"
  3192. else:
  3193. return "其他"
  3194. def turnMoneySource(moneysource):
  3195. result_list = []
  3196. if re.search("自筹|业主筹集|筹资|自有",moneysource):
  3197. result_list.append("自筹")
  3198. if re.search("财政",moneysource) and not re.search("非财政",moneysource):
  3199. result_list.append("财政资金")
  3200. if re.search("拨款|补助|划拨|拨付|国拨|上级资金",moneysource):
  3201. result_list.append("上级拨款")
  3202. if re.search("社会资本|社会资金",moneysource):
  3203. result_list.append("社会资本")
  3204. if re.search("贷款|借款|借贷",moneysource):
  3205. result_list.append("贷款资金")
  3206. if re.search("债券|债|国债",moneysource):
  3207. result_list.append("债券资金")
  3208. if re.search("专项|项目资金",moneysource):
  3209. result_list.append("项目专项资金")
  3210. if re.search("配套",moneysource):
  3211. result_list.append("配套资金")
  3212. if re.search("外资",moneysource):
  3213. result_list.append("外资")
  3214. if re.search("国有资金|国企资金|国资|国家投资",moneysource):
  3215. result_list.append("国有资金")
  3216. if re.search("投资|融资",moneysource):
  3217. result_list.append("投资资金")
  3218. if re.search("预算(?<!外)|预算内",moneysource):
  3219. result_list.append("预算内资金")
  3220. if re.search("预算外",moneysource):
  3221. result_list.append("预算外资金")
  3222. result_list = sorted(result_list,key = lambda x:x)
  3223. if len(result_list)>0 and len(result_list)<5:
  3224. return ",".join(result_list)
  3225. else:
  3226. return "其他资金"
  3227. my_time_format_pattern = re.compile("((?:(?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*)?(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
  3228. from BiddingKG.dl.ratio.re_ratio import getUnifyNum
def my_timeFormat(_time,page_time):
    """Extract the calendar dates contained in a free-text time expression.

    Every match of ``my_time_format_pattern`` in ``_time`` is validated and,
    when legal, appended to the result as a "YYYY-MM-DD" string.

    :param _time: text to scan for dates
    :param page_time: reference date string parsed with '%Y-%m-%d'; its year is
        used for the plausibility check. When falsy, the current local year is
        used instead.
    :return: list of "YYYY-MM-DD" strings, in match order
    """
    # NOTE(review): the indentation of this function was reconstructed from a
    # flattened listing -- confirm the nesting against the repository copy.
    if page_time:
        current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
    else:
        current_year = time.strftime("%Y",time.localtime())
    all_match = re.finditer(my_time_format_pattern,_time)
    time_list = []
    idx = 0  # 1-based count of non-empty matches seen so far
    global_year = ""  # year of the first legal date, reusable by the second match
    for _match in all_match:
        if len(_match.group())>0:
            idx += 1
            legal = True
            year = ""
            month = ""
            day = ""
            for k,v in _match.groupdict().items():
                if k=="year":
                    year = v
                if k=="month":
                    month = v
                if k=="day":
                    day = v
            if year!="":
                if year==None: # e.g. "5月18日": no year was captured
                    if idx==2 and global_year: # e.g. "2025年5月14日-5月18日": the second date has no year
                        year = global_year
                    else:
                        legal = False
                else:
                    if re.search("^\d+$", year):
                        if len(year) == 2:
                            # two-digit year: prefix "20", accept current_year-1 .. current_year+5
                            year = "20" + year
                            if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
                                legal = False
                        else:
                            # other all-digit years: accept current_year-1 .. current_year+10
                            if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
                                legal = False
                    else:
                        # non-digit year: convert character by character via getDigitsDic, keeping '0' as is
                        _year = ""
                        for word in year:
                            if word == '0':
                                _year += word
                            else:
                                _year += str(getDigitsDic(word))
                        year = _year
            else:
                legal = False
            if month!="":
                if re.search("^\d+$", month):
                    if int(month) > 12:
                        legal = False
                else:
                    # non-digit month: convert with getUnifyNum, then range-check
                    month = int(getUnifyNum(month))
                    if month >= 1 and month <= 12:
                        month = str(month)
                    else:
                        legal = False
            else:
                legal = False
            if day!="":
                if re.search("^\d+$", day):
                    if int(day) > 31:
                        legal = False
                else:
                    # non-digit day: convert with getUnifyNum, then range-check
                    day = int(getUnifyNum(day))
                    if day >= 1 and day <= 31:
                        day = str(day)
                    else:
                        legal = False
            else:
                legal = False
            # final check of the year/month/day combination through isValidDate
            if legal and not isValidDate(int(year),int(month),int(day)):
                legal = False
            if legal:
                # normalise the numeric strings
                year = str(int(year))
                month = str(int(month))
                day = str(int(day))
                time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
                if idx==1 and not global_year:
                    global_year = year
    return time_list
  3312. def getTimeAttributes(list_entity,list_sentence,page_time):
  3313. # from BiddingKG.dl.interface.htmlparser import get_childs
  3314. # document_tree = parse_document.tree
  3315. # new_document_tree = []
  3316. # _data_i = -1
  3317. # while _data_i < len(document_tree) - 1:
  3318. # _data_i += 1
  3319. # _data = document_tree[_data_i]
  3320. # _type = _data["type"]
  3321. # if _type == "sentence":
  3322. # if _data["sentence_title"] is not None:
  3323. # new_document_tree.append(_data)
  3324. # document_tree = new_document_tree
  3325. time_entitys = [i for i in list_entity if i.entity_type=='time']
  3326. time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
  3327. list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
  3328. dict_time = {
  3329. "time_release": [], # 1 发布时间
  3330. "time_bidopen": [], # 2 开标时间
  3331. "time_bidclose": [], # 3 截标时间
  3332. 'time_bidstart': [], # 12 投标(开始)时间、响应文件接收(开始)时间
  3333. 'time_publicityStart': [], # 4 公示开始时间(公示时间、公示期)
  3334. 'time_publicityEnd': [], # 5 公示截止时间
  3335. 'time_getFileStart': [], # 6 文件获取开始时间(文件获取时间)
  3336. 'time_getFileEnd': [], # 7 文件获取截止时间
  3337. 'time_registrationStart': [], # 8 报名开始时间(报名时间)
  3338. 'time_registrationEnd': [], # 9 报名截止时间
  3339. 'time_earnestMoneyStart': [], #10 保证金递交开始时间(保证金递交时间)
  3340. 'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间
  3341. 'time_commencement':[] , #13 开工日期
  3342. 'time_completion': [], # 14 竣工日期
  3343. 'time_listingStart': [], # 15 挂牌开始日期(挂牌时间)
  3344. 'time_listingEnd': [], # 16 挂牌结束日期、挂牌截止日期
  3345. 'time_signContract': [], # 17 合同签订时间
  3346. 'time_contractStart': [], # 18 合同开始时间
  3347. 'time_contractEnd': [] # 19 合同结束时间
  3348. }
  3349. dict_time2label = {
  3350. "time_release": 1, # 1 发布时间
  3351. "time_bidopen": 2, # 2 开标时间
  3352. "time_bidclose": 3, # 3 截标时间
  3353. 'time_bidstart': 12, # 12 投标(开始)时间、响应文件接收(开始)时间
  3354. 'time_publicityStart': 4, # 4 公示开始时间(公示时间、公示期)
  3355. 'time_publicityEnd': 5, # 5 公示截止时间
  3356. 'time_getFileStart': 6, # 6 文件获取开始时间(文件获取时间)
  3357. 'time_getFileEnd': 7, # 7 文件获取截止时间
  3358. 'time_registrationStart': 8, # 8 报名开始时间(报名时间)
  3359. 'time_registrationEnd': 9, # 9 报名截止时间
  3360. 'time_earnestMoneyStart': 10, # 10 保证金递交开始时间(保证金递交时间)
  3361. 'time_earnestMoneyEnd': 11, # 11 保证金递交截止时间
  3362. 'time_commencement': 13, # 13 开工日期
  3363. 'time_completion': 14, # 14 竣工日期
  3364. 'time_listingStart': 15, # 15 挂牌开始日期(挂牌时间)
  3365. 'time_listingEnd': 16, # 16 挂牌结束日期、挂牌截止日期
  3366. 'time_signContract': 17, # 17 合同签订时间
  3367. 'time_contractStart': 18, # 18 合同开始时间
  3368. 'time_contractEnd': 19 # 19 合同结束时间
  3369. }
  3370. last_sentence_index = 0
  3371. last_time_type = ""
  3372. last_time_index = {
  3373. 'time_bidstart':"time_bidclose",
  3374. 'time_publicityStart':"time_publicityEnd",
  3375. 'time_getFileStart':"time_getFileEnd",
  3376. 'time_registrationStart':"time_registrationEnd",
  3377. 'time_earnestMoneyStart':"time_earnestMoneyEnd",
  3378. 'time_commencement':"time_completion",
  3379. 'time_listingStart':"time_listingEnd",
  3380. 'time_contractStart':"time_contractEnd"
  3381. }
  3382. time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
  3383. time_entitys = [item for item in time_entitys if item[1]]
  3384. # print(time_entitys)
  3385. for entity_idx in range(len(time_entitys)):
  3386. entity = time_entitys[entity_idx][0]
  3387. extract_time = time_entitys[entity_idx][1]
  3388. sentence_text = list_sentence[entity.sentence_index].sentence_text
  3389. previous_entity = time_entitys[entity_idx-1][0] if entity_idx!=0 else None
  3390. previous_extract_time = time_entitys[entity_idx-1][1] if entity_idx!=0 else None
  3391. next_entity = time_entitys[entity_idx+1][0] if entity_idx!=len(time_entitys)-1 else None
  3392. next_extract_time = time_entitys[entity_idx+1][1] if entity_idx!=len(time_entitys)-1 else None
  3393. # 实体有效上下文
  3394. entity_context_begin = previous_entity.wordOffset_end if previous_entity and previous_entity.sentence_index==entity.sentence_index else 0
  3395. entity_context_end = next_entity.wordOffset_begin if next_entity and next_entity.sentence_index==entity.sentence_index else len(sentence_text)
  3396. if entity.sentence_index!=last_sentence_index:
  3397. # sentence_index 不同句子重置last_time_type
  3398. last_time_type = ""
  3399. entity_left = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 2):entity.wordOffset_begin]
  3400. entity_left2 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 10):entity.wordOffset_begin]
  3401. entity_left3 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 30):entity.wordOffset_begin]
  3402. entity_right = sentence_text[entity.wordOffset_end:min(entity.wordOffset_end + 3,entity_context_end)]
  3403. entity_right2 = sentence_text[entity.wordOffset_end:entity_context_end]
  3404. entity_right2 = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",'',entity_right2)[:60] # 去除网址
  3405. # print(entity.entity_text,entity_right2)
  3406. label_prob = entity.values[entity.label]
  3407. entity_text = entity.entity_text
  3408. in_attachment = entity.in_attachment
  3409. # extract_time = my_timeFormat(entity_text,page_time)
  3410. # print(entity_text,entity_left2)
  3411. if extract_time:
  3412. definite_time_list = []
  3413. t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{1,2})?[::分]?(?P<second>\d{2})?秒?")
  3414. _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
  3415. _entity_text_len = len(_entity_text)
  3416. _entity_text = _entity_text + sentence_text[entity.wordOffset_end:entity.wordOffset_end+20]
  3417. t_in_word_num = len(re.findall(t,_entity_text))
  3418. # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
  3419. begin_index = 0
  3420. definite_time_idx_list = []
  3421. for _num in range(t_in_word_num):
  3422. if begin_index> _entity_text_len + 8:
  3423. break
  3424. t_in_word = re.search(t, _entity_text[begin_index:])
  3425. # print(_entity_text[begin_index:])
  3426. if t_in_word:
  3427. if _num==0 and t_in_word.start() > _entity_text_len + 8:
  3428. break
  3429. begin_index += t_in_word.end()
  3430. # print('t_in_word',entity_text,t_in_word.groupdict())
  3431. day = t_in_word.groupdict().get('day',"")
  3432. hour = t_in_word.groupdict().get('hour',"")
  3433. half_hour = t_in_word.groupdict().get('half_hour',"")
  3434. minute = t_in_word.groupdict().get('minute',"")
  3435. second = t_in_word.groupdict().get('second',"")
  3436. if hour:
  3437. if day=='下午' and int(hour)<12:
  3438. hour = str(int(hour)+12)
  3439. if int(hour)>24:
  3440. continue
  3441. else:
  3442. hour = "00"
  3443. if not minute:
  3444. if half_hour:
  3445. minute = "30"
  3446. else:
  3447. minute = "00"
  3448. if int(minute)>60:
  3449. continue
  3450. if not second:
  3451. second = "00"
  3452. if int(second)>60:
  3453. continue
  3454. definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
  3455. # print(definite_time)
  3456. definite_time_list.append(definite_time)
  3457. definite_time_idx_list.append([begin_index-len(t_in_word.group()),begin_index])
  3458. if len(extract_time)==1 and len(definite_time_list)>=2: # 实体只包含一个时间,"2024-12-09 09:00~16:00" 考虑单个时间对应两个详细时间段的识别
  3459. # 前两个详细时间的间隔
  3460. distance = definite_time_idx_list[1][0] - definite_time_idx_list[0][1]
  3461. if distance<=8 and int(definite_time_list[1][:2])>=int(definite_time_list[0][:2]): # 判断详细时间都‘小时’顺序从小到大
  3462. new_extract_time = []
  3463. for d_time in definite_time_list[:2]:
  3464. if d_time == "24:00:00": # 修正不规范时间表述
  3465. d_time = "23:59:59"
  3466. new_extract_time.append(extract_time[0] + " " + d_time)
  3467. extract_time = new_extract_time
  3468. else:
  3469. if definite_time_list[0] == "24:00:00": # 修正不规范时间表述
  3470. definite_time_list[0] = "23:59:59"
  3471. if definite_time_list[0] != "00:00:00":
  3472. extract_time[0] = extract_time[0] + " " + definite_time_list[0]
  3473. else:
  3474. min_len = min(len(extract_time),len(definite_time_list))
  3475. for i in range(min_len):
  3476. if definite_time_list[i] == "24:00:00": # 修正不规范时间表述
  3477. definite_time_list[i] = "23:59:59"
  3478. if definite_time_list[i] != "00:00:00":
  3479. extract_time[i] = extract_time[i] + " " + definite_time_list[i]
  3480. if extract_time:
  3481. # 时间变更prob优化
  3482. if re.search("原",entity_left2):
  3483. last_index = 0
  3484. for item in re.finditer("原",entity_left2):
  3485. last_index = item.start() + 1
  3486. label_prob = label_prob - 0.2 * last_index / len(entity_left2)
  3487. # print('prob优化',label_prob,extract_time)
  3488. elif re.search("改正|更正|修正|更改|延期",entity_left2):
  3489. new_label = dict_time2label.get(last_time_type,None)
  3490. if new_label and entity.label==0:
  3491. entity.label = new_label
  3492. label_prob = 1
  3493. # 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
  3494. if entity.label in [2,3,9]:
  3495. if entity.label==2 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
  3496. dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment))
  3497. if entity.label==3 and re.search("开标|(评审|比选).{,2}(?:开始)?(时间|日期)|选取.{,2}(时间|日期)",entity_left3):
  3498. dict_time['time_bidopen'].append((extract_time[0], label_prob-0.1, in_attachment))
  3499. if entity.label==3 and re.search("报名",entity_left3):
  3500. dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
  3501. if entity.label==3 and re.search("获取",entity_left3[-20:]):
  3502. dict_time['time_getFileEnd'].append((extract_time[0], 0.45, in_attachment))
  3503. if entity.label==9 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
  3504. dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment))
  3505. if entity.label in [11, 3]:
  3506. if entity.label==11 and re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
  3507. dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
  3508. if entity.label==3 and re.search("保证金.{,2}(接受|收取)|(接受|收取).{,2}保证金",entity_left3):
  3509. dict_time['time_earnestMoneyEnd'].append((extract_time[0], 0.5, in_attachment))
  3510. if entity.label in [6, 7]:
  3511. if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
  3512. dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
  3513. if entity.label==0:
  3514. if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
  3515. if len(extract_time)>=2:
  3516. dict_time['time_bidstart'].append((extract_time[0], 0.45, in_attachment))
  3517. dict_time['time_bidclose'].append((extract_time[1], 0.45, in_attachment))
  3518. else:
  3519. dict_time['time_bidclose'].append((extract_time[0], 0.45, in_attachment))
  3520. if entity.label==6:
  3521. # "文件获取时间"和"报名时间"并列
  3522. if re.search("报名",entity_left3):
  3523. if len(extract_time)==1:
  3524. dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
  3525. else:
  3526. dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
  3527. dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
  3528. # 获取文件/报名/报价 时间补充(上下文表达过长无法通过模型识别)
  3529. # if entity.label == 0:
  3530. # if re.search("(获取|领取|售卖|出售|购买|下载).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_left3):
  3531. # if len(extract_time)==2:
  3532. # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
  3533. # dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment))
  3534. # else:
  3535. # if next_entity and next_entity.sentence_index==entity.sentence_index:
  3536. # mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
  3537. # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1:
  3538. # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
  3539. # dict_time['time_getFileEnd'].append((next_extract_time[0], 0.51, in_attachment))
  3540. # if not dict_time['time_getFileEnd']:
  3541. # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
  3542. # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
  3543. # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
  3544. # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
  3545. # if re.search("(进行|在线|线下|线上|网上).{,2}报名|报名.{,2}(开始)?(时间|日期)", entity_left3):
  3546. # if len(extract_time)==2:
  3547. # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
  3548. # dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
  3549. # else:
  3550. # if next_entity and next_entity.sentence_index==entity.sentence_index:
  3551. # mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
  3552. # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1:
  3553. # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
  3554. # dict_time['time_registrationEnd'].append((next_extract_time[0], 0.51, in_attachment))
  3555. # if not dict_time['time_registrationEnd']:
  3556. # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
  3557. # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
  3558. # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
  3559. # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
  3560. #
  3561. # if re.search("(获取|售卖|出售|购买).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_right2):
  3562. # if len(extract_time)==2:
  3563. # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
  3564. # dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment))
  3565. # else:
  3566. # if previous_entity and previous_entity.sentence_index==entity.sentence_index:
  3567. # mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
  3568. # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
  3569. # dict_time['time_getFileStart'].append((previous_extract_time[0], 0.51, in_attachment))
  3570. # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
  3571. # if not dict_time['time_getFileEnd']:
  3572. # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
  3573. # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
  3574. # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
  3575. # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
  3576. # if re.search("(进行|在线|线下).{,2}报名", entity_right2):
  3577. # if len(extract_time) == 2:
  3578. # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
  3579. # dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
  3580. # else:
  3581. # if previous_entity and previous_entity.sentence_index==entity.sentence_index:
  3582. # mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
  3583. # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
  3584. # dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment))
  3585. # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
  3586. # if not dict_time['time_registrationEnd']:
  3587. # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
  3588. # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
  3589. # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
  3590. # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
  3591. # if re.search("(进行|开始).{,4}(报价|投标|竞价)", entity_right2):
  3592. # if len(extract_time) == 2:
  3593. # dict_time['time_bidstart'].append((extract_time[0], 0.51, in_attachment))
  3594. # # dict_time['time_bidclose'].append((extract_time[1], 0.51, in_attachment))
  3595. # 补充公告末尾处的发布时间
  3596. if entity.label==0:
  3597. if entity.is_tail:
  3598. entity.label = 1
  3599. entity.values[1] = 0.5
  3600. dict_time['time_release'].append((extract_time[0], 0.5, in_attachment))
  3601. # 2022/12/12 新增挂牌时间正则
  3602. if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
  3603. if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
  3604. if len(extract_time) == 1:
  3605. if re.search("挂牌.?(开始|起始).?(?:时间|日期)",entity_left2):
  3606. dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
  3607. last_time_type = 'time_listingStart'
  3608. elif re.search("挂牌.?(截[止至]|结束).?(?:时间|日期)",entity_left2):
  3609. dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
  3610. last_time_type = 'time_listingEnd'
  3611. elif re.search("挂牌.?(?:时间|日期)",entity_left2):
  3612. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  3613. dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
  3614. last_time_type = 'time_listingEnd'
  3615. else:
  3616. dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
  3617. last_time_type = 'time_listingStart'
  3618. else:
  3619. dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
  3620. dict_time['time_listingEnd'].append((extract_time[1], 0.5, in_attachment))
  3621. last_time_type = ''
  3622. last_sentence_index = entity.sentence_index
  3623. continue
  3624. # 2023/9/13 新增合同相关时间
  3625. if re.search("合同|服务|履[约行]", entity_left3[-15:]):
  3626. if len(extract_time) == 1:
  3627. if re.search("(合同.{,2}签[订定署].{,2}|签[订定署].{,2}合同.{,2})(?:时间|日期)|合同签[订定署].{,1}$", entity_left2):
  3628. dict_time['time_signContract'].append((extract_time[0], 0.5, in_attachment))
  3629. last_time_type = 'time_signContract'
  3630. last_sentence_index = entity.sentence_index
  3631. continue
  3632. elif re.search("(合同|服务|履约|(合同|服务)履行).{,4}(?:起始|开始)(?:时间|日期)", entity_left3[-15:]):
  3633. dict_time['time_contractStart'].append((extract_time[0], 0.55, in_attachment))
  3634. last_time_type = 'time_contractStart'
  3635. last_sentence_index = entity.sentence_index
  3636. continue
  3637. elif re.search("(合同|服务|履约).{,2}(?:完成|截止|结束)(?:时间|日期|时限)", entity_left2):
  3638. dict_time['time_contractEnd'].append((extract_time[0], 0.55, in_attachment))
  3639. last_time_type = 'time_contractEnd'
  3640. last_sentence_index = entity.sentence_index
  3641. continue
  3642. elif re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
  3643. if re.search("到|至|截[至止]",entity_left) or re.search("前|止|截止",entity_right) or re.search("前",entity_text[-2:]):
  3644. dict_time['time_contractEnd'].append((extract_time[0], 0.5, in_attachment))
  3645. last_time_type = 'time_contractEnd'
  3646. else:
  3647. dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
  3648. last_time_type = 'time_contractStart'
  3649. last_sentence_index = entity.sentence_index
  3650. continue
  3651. else:
  3652. if re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
  3653. # 排除开始和借宿时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
  3654. if extract_time[0]!=extract_time[1]:
  3655. dict_time['time_contractStart'].append((extract_time[0], 0.6, in_attachment))
  3656. dict_time['time_contractEnd'].append((extract_time[1], 0.6, in_attachment))
  3657. last_time_type = ''
  3658. last_sentence_index = entity.sentence_index
  3659. continue
  3660. # 服务期限表达补充
  3661. if entity.label==0:
  3662. re_service = '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
  3663. '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \
  3664. '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)(时间|日期)|交付\(服务、完工\)(时间|日期)' \
  3665. '|交货(时间|日期)|工期承诺|(服务|合同|施工|实施|工程|设计)的?(年限|期限|周期|期:)' \
  3666. '|服务期限为|计划工期|工期要求|服务期限|服务期' \
  3667. '|投标工期|设计工期|合格服务周期|总工期|服务(时间|日期)(范围)?|流转期限|维护期限|服务时限|交货期' \
  3668. '|完成(时间|日期)|服务期限|中标工期|项目周期|期限要求|供货期|合同履行日期|计划的?周期' \
  3669. '|履约期限|合同约定完成时限|合同完成日期|承诺完成日期' \
  3670. '|合同起始日起|合同履约期|履约截止日期|承包期限|合同完成日期' \
  3671. '|服务期间|服务履行期|委托(管理)?期限|履约期限、地点等简要信息'
  3672. if len(extract_time)==2:
  3673. if re.search(re_service,entity_left2) or re.search("履约期限、地点等简要信息",entity_left3[-20:]):
  3674. dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
  3675. dict_time['time_contractEnd'].append((extract_time[1], 0.5, in_attachment))
  3676. last_time_type = ''
  3677. # 报价/投标时间补充(规则补充)
  3678. if entity.label == 0:
  3679. if re.search("[报竞]价.{,2}(开始|起始).{,2}(时间|日期)",entity_left2):
  3680. entity.label = 12
  3681. label_prob = 0.8
  3682. elif re.search("[报竞]价.{,2}起止.{,2}(时间|日期)",entity_left2):
  3683. entity.label = 12
  3684. label_prob = 0.6
  3685. elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)[::]|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)[::]",entity_left2):
  3686. entity.label = 3
  3687. label_prob = 0.501
  3688. elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)",entity_left2) and not re.search("截[止至]",entity_left2):
  3689. entity.label = 12
  3690. label_prob = 0.51
  3691. elif re.search("[报竞]价.{,2}截[止至].{,2}(时间|日期)",entity_left2):
  3692. entity.label = 3
  3693. label_prob = 0.8
  3694. elif re.search("(竞价|报价).?(时间|日期)",entity_left2):
  3695. entity.label = 12
  3696. label_prob = 0.51
  3697. elif re.search("(竞价|报价).?(时间|日期)",entity_left3) and re.search("参与|报价|有意",entity_left2):
  3698. entity.label = 12
  3699. label_prob = 0.501
  3700. # 文档结构补充
  3701. # if entity.label == 0:
  3702. # re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|"
  3703. # "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)")
  3704. # _data_i = -1
  3705. # while _data_i < len(document_tree) - 1:
  3706. # _data_i += 1
  3707. # _data = document_tree[_data_i]
  3708. # _type = _data["type"]
  3709. # _text = _data["text"].strip()
  3710. # childs = get_childs([_data])
  3711. # last_child = childs[-1]
  3712. # if entity.sentence_index>=_data.sentence_index and entity.wordOffset_begin>=_data.wordOffset_begin and
  3713. # ():
  3714. # if re.search(re_registration, re.split("[::;;,]", _text)[0][:20]) is not None:
  3715. #
  3716. # content_text = ""
  3717. # for c in childs:
  3718. # content_text += c["text"] + ""
  3719. # print('concat_text', content_text)
  3720. if re.search("[,;](完成|截止|结束)(时间|日期)", entity_left2[-8:]) and entity.label==0:
  3721. if entity.sentence_index == last_sentence_index:
  3722. time_type = last_time_index.get(last_time_type)
  3723. if time_type:
  3724. dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment))
  3725. last_time_type = ""
  3726. last_sentence_index = entity.sentence_index
  3727. continue
  3728. if re.search("至|到|[日\d][-—]$|[~~]", entity_left):
  3729. if entity.sentence_index == last_sentence_index:
  3730. time_type = last_time_index.get(last_time_type)
  3731. if time_type:
  3732. dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment))
  3733. last_time_type = ""
  3734. last_sentence_index = entity.sentence_index
  3735. continue
  3736. if entity.label!=0:
  3737. if entity.label==1 and label_prob>0.5:
  3738. dict_time['time_release'].append((extract_time[0],label_prob,in_attachment))
  3739. last_time_type = 'time_release'
  3740. elif entity.label==2 and label_prob>0.5:
  3741. dict_time['time_bidopen'].append((extract_time[0],label_prob,in_attachment))
  3742. last_time_type = 'time_bidopen'
  3743. elif entity.label==3 and label_prob>0.5:
  3744. if len(extract_time)==1:
  3745. dict_time['time_bidclose'].append((extract_time[0],label_prob,in_attachment))
  3746. last_time_type = 'time_bidclose'
  3747. elif len(extract_time)==2:
  3748. dict_time['time_bidstart'].append((extract_time[0], 0.6, in_attachment))
  3749. dict_time['time_bidclose'].append((extract_time[1], label_prob, in_attachment))
  3750. last_time_type = 'time_bidclose'
  3751. elif entity.label==12 and label_prob>0.5:
  3752. if len(extract_time)==1:
  3753. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  3754. dict_time['time_bidclose'].append((extract_time[0], label_prob,in_attachment))
  3755. last_time_type = 'time_bidclose'
  3756. else:
  3757. dict_time['time_bidstart'].append((extract_time[0], label_prob,in_attachment))
  3758. last_time_type = 'time_bidstart'
  3759. else:
  3760. dict_time['time_bidstart'].append((extract_time[0],label_prob,in_attachment))
  3761. dict_time['time_bidclose'].append((extract_time[1],label_prob,in_attachment))
  3762. last_time_type = ''
  3763. elif entity.label==4 and label_prob>0.5:
  3764. if len(extract_time)==1:
  3765. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  3766. dict_time['time_publicityEnd'].append((extract_time[0], label_prob,in_attachment))
  3767. last_time_type = 'time_publicityEnd'
  3768. else:
  3769. dict_time['time_publicityStart'].append((extract_time[0], label_prob,in_attachment))
  3770. last_time_type = 'time_publicityStart'
  3771. else:
  3772. dict_time['time_publicityStart'].append((extract_time[0],label_prob,in_attachment))
  3773. dict_time['time_publicityEnd'].append((extract_time[1],label_prob,in_attachment))
  3774. last_time_type = ''
  3775. elif entity.label==5 and label_prob>0.5:
  3776. if len(extract_time)==1:
  3777. dict_time['time_publicityEnd'].append((extract_time[0], label_prob,in_attachment))
  3778. last_time_type = 'time_publicityEnd'
  3779. else:
  3780. dict_time['time_publicityStart'].append((extract_time[0],label_prob,in_attachment))
  3781. dict_time['time_publicityEnd'].append((extract_time[1],label_prob,in_attachment))
  3782. last_time_type = ''
  3783. elif entity.label==6 and label_prob>0.5:
  3784. if len(extract_time)==1:
  3785. if (re.search("前|截?止",entity_right) and re.search("前|截?止(?!时间|日期)",entity_right2[:len(entity_right)+3])) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  3786. dict_time['time_getFileEnd'].append((extract_time[0], label_prob,in_attachment))
  3787. last_time_type = 'time_getFileEnd'
  3788. else:
  3789. dict_time['time_getFileStart'].append((extract_time[0], label_prob,in_attachment))
  3790. last_time_type = 'time_getFileStart'
  3791. else:
  3792. dict_time['time_getFileStart'].append((extract_time[0],label_prob,in_attachment))
  3793. dict_time['time_getFileEnd'].append((extract_time[1],label_prob,in_attachment))
  3794. last_time_type = ''
  3795. elif entity.label==7 and label_prob>0.5:
  3796. if len(extract_time)==1:
  3797. dict_time['time_getFileEnd'].append((extract_time[0], label_prob,in_attachment))
  3798. last_time_type = 'time_getFileEnd'
  3799. else:
  3800. dict_time['time_getFileStart'].append((extract_time[0],label_prob,in_attachment))
  3801. dict_time['time_getFileEnd'].append((extract_time[1],label_prob,in_attachment))
  3802. last_time_type = ''
  3803. elif entity.label==8 and label_prob>0.5:
  3804. if len(extract_time)==1:
  3805. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  3806. dict_time['time_registrationEnd'].append((extract_time[0], label_prob,in_attachment))
  3807. last_time_type = 'time_registrationEnd'
  3808. else:
  3809. dict_time['time_registrationStart'].append((extract_time[0], label_prob,in_attachment))
  3810. last_time_type = 'time_registrationStart'
  3811. else:
  3812. dict_time['time_registrationStart'].append((extract_time[0],label_prob,in_attachment))
  3813. dict_time['time_registrationEnd'].append((extract_time[1],label_prob,in_attachment))
  3814. last_time_type = ''
  3815. elif entity.label==9 and label_prob>0.5:
  3816. if len(extract_time)==1:
  3817. dict_time['time_registrationEnd'].append((extract_time[0], label_prob,in_attachment))
  3818. last_time_type = 'time_registrationEnd'
  3819. else:
  3820. dict_time['time_registrationStart'].append((extract_time[0],label_prob,in_attachment))
  3821. dict_time['time_registrationEnd'].append((extract_time[1],label_prob,in_attachment))
  3822. last_time_type = ''
  3823. elif entity.label==10 and label_prob>0.5:
  3824. if len(extract_time)==1:
  3825. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  3826. dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob,in_attachment))
  3827. last_time_type = 'time_earnestMoneyEnd'
  3828. else:
  3829. dict_time['time_earnestMoneyStart'].append((extract_time[0], label_prob,in_attachment))
  3830. last_time_type = 'time_earnestMoneyStart'
  3831. else:
  3832. dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob,in_attachment))
  3833. dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob,in_attachment))
  3834. last_time_type = ''
  3835. elif entity.label==11 and label_prob>0.5:
  3836. if len(extract_time)==1:
  3837. dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob,in_attachment))
  3838. last_time_type = 'time_earnestMoneyEnd'
  3839. else:
  3840. dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob,in_attachment))
  3841. dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob,in_attachment))
  3842. last_time_type = ''
  3843. elif entity.label==13 and label_prob>0.5:
  3844. if len(extract_time)==1:
  3845. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  3846. dict_time['time_completion'].append((extract_time[0], label_prob,in_attachment))
  3847. last_time_type = 'time_completion'
  3848. else:
  3849. dict_time['time_commencement'].append((extract_time[0], label_prob,in_attachment))
  3850. last_time_type = 'time_commencement'
  3851. else:
  3852. dict_time['time_commencement'].append((extract_time[0],label_prob,in_attachment))
  3853. dict_time['time_completion'].append((extract_time[1],label_prob,in_attachment))
  3854. last_time_type = ''
  3855. elif entity.label==14 and label_prob>0.5:
  3856. if len(extract_time)==1:
  3857. dict_time['time_completion'].append((extract_time[0], label_prob,in_attachment))
  3858. last_time_type = 'time_completion'
  3859. else:
  3860. dict_time['time_commencement'].append((extract_time[0],label_prob,in_attachment))
  3861. dict_time['time_completion'].append((extract_time[1],label_prob,in_attachment))
  3862. last_time_type = ''
  3863. else:
  3864. last_time_type = ""
  3865. else:
  3866. last_time_type = ""
  3867. else:
  3868. last_time_type = ""
  3869. last_sentence_index = entity.sentence_index
  3870. # 通过文档分析树形结构补充部分时间实体
  def add_time_by_parseDocument(dict_time,parse_document):
      """Supplement registration start/end times from the parsed document tree.

      Walks ``parse_document.tree`` looking for titled sentence nodes whose
      leading clause mentions registration or obtaining/purchasing bid
      documents, concatenates the text of each such node's children, and
      scans that text for time expressions that are followed by a
      registration phrase.  Matching times are appended to
      ``dict_time['time_registrationStart']`` / ``dict_time['time_registrationEnd']``
      with a fixed score of 0.51.

      The tree is only scanned when at least one of the two registration
      lists in ``dict_time`` is still empty.

      NOTE(review): this function reads ``page_time``, ``in_attachment``,
      ``entity``, ``previous_entity``, ``sentence_text``, ``entity_text`` and
      ``previous_extract_time`` without defining them; they are presumably
      resolved from the enclosing scope and may not correspond to the time
      matched in the concatenated text -- confirm before enabling the call.

      :param dict_time: mapping of time-type name to a list of
          ``(time_text, score, in_attachment)`` tuples; mutated in place
      :param parse_document: object exposing the document node list as ``.tree``
      :return: the same ``dict_time`` object
      """
      from BiddingKG.dl.interface.htmlparser import get_childs
      document_tree = parse_document.tree
      concat_text_list = []
      if not dict_time['time_registrationStart'] or not dict_time['time_registrationEnd']:
          re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|"
                                       "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)")
          _data_i = -1
          while _data_i < len(document_tree) - 1:
              _data_i += 1
              _data = document_tree[_data_i]
              _type = _data["type"]
              _text = _data["text"].strip()
              # Only titled sentence nodes are inspected; table nodes are ignored.
              if _type != "sentence" or _data["sentence_title"] is None:
                  continue
              # Test only the first clause (at most 15 characters) of the node text.
              leading_clause = re.split("[::;;。]",_text)[0][:15]
              if re.search(re_registration, leading_clause) is None:
                  continue
              childs = get_childs([_data])
              concat_text_list.append("".join(c["text"] for c in childs))
              # Jump over the nodes that were just consumed as children.
              _data_i += len(childs)-1

      url_pattern = r"(http[s]?://)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F])){6,}"
      for text in concat_text_list:
          time_list = [(i,my_timeFormat(i.group(),page_time))
                       for i in re.finditer(my_time_format_pattern,text)]
          for _time, extract_time in time_list:
              # Right-hand context: the rest of the current clause after the match.
              entity_right = re.split("[。;;!??]",text[_time.end():])[0]
              # Strip URL-like runs, then keep at most 60 characters.
              entity_right2 = re.sub(url_pattern, '', entity_right)[:60]
              if not re.search("(进行|在线|线下).{,2}报名", entity_right2):
                  continue
              if len(extract_time) == 2:
                  # A single expression carrying both ends of the range.
                  dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
                  dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
                  continue
              if previous_entity and previous_entity.sentence_index==entity.sentence_index:
                  mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
                  # A short range connector between two single times pairs them up.
                  if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
                      dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment))
                      dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
              if not dict_time['time_registrationEnd']:
                  # Fall back to start/end cue words around the time expression.
                  if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
                      dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
                  elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
                      dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
      return dict_time
  3946. # dict_time = add_time_by_parseDocument(dict_time,parse_document)
  3947. # print(dict_time)
  3948. result_dict = dict((key,"") for key in dict_time.keys())
  3949. for time_type,value in dict_time.items():
  3950. list_time = dict_time[time_type]
  3951. if list_time:
  3952. for in_attachment in [False,True]:
  3953. _list_time = [_time for _time in list_time if _time[2]==in_attachment]
  3954. if _list_time:
  3955. _list_time.sort(key=lambda x:(x[1],len(x[0])),reverse=True) # sort_key: label_prob,时间文本长度(优先有具体时分秒的)
  3956. if in_attachment==True and len(result_dict[time_type])>0:
  3957. break
  3958. result_dict[time_type] = _list_time[0][0]
  3959. # result_dict 纠错
  3960. if not result_dict['time_bidclose']:
  3961. if result_dict['time_bidstart']: # 无截标时间,投标开始和开标时间一样
  3962. if result_dict['time_bidstart'][:10] in result_dict['time_bidopen']:
  3963. result_dict['time_bidstart'] = ""
  3964. result_dict['time_bidclose'] = result_dict['time_bidopen']
  3965. if not result_dict['time_bidclose']:
  3966. if result_dict['time_getFileEnd']: # 无截标时间,获取文件截止时间和开标时间一样
  3967. if result_dict['time_getFileEnd'][:10] in result_dict['time_bidopen']:
  3968. result_dict['time_bidclose'] = result_dict['time_bidopen']
  3969. else:
  3970. if result_dict['time_bidopen']: # 截标时间 和 开标时间 时分秒互补
  3971. if len(result_dict['time_bidclose'])<len(result_dict['time_bidopen']) and result_dict['time_bidclose'] in result_dict['time_bidopen']:
  3972. result_dict['time_bidclose'] = result_dict['time_bidopen']
  3973. elif len(result_dict['time_bidclose'])>len(result_dict['time_bidopen']) and result_dict['time_bidopen'] in result_dict['time_bidclose']:
  3974. result_dict['time_bidopen'] = result_dict['time_bidclose']
  3975. return result_dict
  def get_days_between(day1,day2,get_abs=0):
      """Return the number of days from ``day1`` to ``day2``.

      :param day1: the earlier date, formatted as ``'%Y-%m-%d'``
      :param day2: the later date, formatted as ``'%Y-%m-%d'``
      :param get_abs: when truthy, return the absolute value of the difference
      :return: ``day2 - day1`` in whole days; negative when ``day2`` precedes
          ``day1`` and ``get_abs`` is falsy
      :raises ValueError: if either string does not match ``'%Y-%m-%d'``
      """
      date_format = '%Y-%m-%d'
      # Parse both endpoints (day1 first, then day2) into datetime objects.
      start, end = (datetime.strptime(day, date_format) for day in (day1, day2))
      gap = (end - start).days
      return abs(gap) if get_abs else gap
  3994. def extract_serviceTime(service_time,page_time):
  3995. pattern1 = re.compile("\d{4}[年\-./]\d{1,2}[月\-./]\d{1,2}日?")
  3996. pattern2 = re.compile("\d+(?:\.\d+)?[((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
  3997. pattern3 = re.compile("\d{4}[年\-./]\d{1,2}月?")
  3998. pattern4 = re.compile("(?:日|天|周年|年|月|周|日历[天日]|工作[天日]|星期)[^\d]{1,3}\d+(?:\.\d+)?")
  3999. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  4000. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9,
  4001. "两":2, '貮': 2}
  4002. def get_month_days(year, month):
  4003. # calendar.monthrange(year, month)返回一个元组,其中第一个元素是月份的第一天是星期几(0-6为星期一到星期日),
  4004. # 第二个元素是该月的天数。
  4005. _, last_day = calendar.monthrange(year, month)
  4006. return last_day
  4007. def get_num(text):
  4008. CN_UNIT = {'十': 10,'拾': 10,'百': 100,
  4009. '佰': 100,'千': 1000,'仟': 1000}
  4010. regex = re.compile(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+')
  4011. text = regex.search(text)
  4012. if text:
  4013. text = text.group()
  4014. else:
  4015. return ""
  4016. result = 0
  4017. result_list = []
  4018. unit = 0
  4019. control = 0
  4020. for i, d in enumerate(text):
  4021. if d in '零百佰千仟' and i == 0:
  4022. return ""
  4023. if d in DigitsDic:
  4024. result += DigitsDic[d]
  4025. elif d in CN_UNIT:
  4026. if unit == 0:
  4027. unit_1 = CN_UNIT[d]
  4028. # 这里的处理主要是考虑到类似于二十三亿五千万这种数
  4029. if result == 0:
  4030. result = CN_UNIT[d]
  4031. else:
  4032. result *= CN_UNIT[d]
  4033. unit = CN_UNIT[d]
  4034. result_1 = result
  4035. elif unit > CN_UNIT[d]:
  4036. result -= DigitsDic[text[i - 1]]
  4037. result += DigitsDic[text[i - 1]] * CN_UNIT[d]
  4038. unit = CN_UNIT[d]
  4039. elif unit <= CN_UNIT[d]:
  4040. if (CN_UNIT[d] < unit_1) and (len(result_list) == control):
  4041. result_list.append(result_1)
  4042. result = (result - result_1) * CN_UNIT[d]
  4043. control += 1
  4044. else:
  4045. result *= CN_UNIT[d]
  4046. unit = CN_UNIT[d]
  4047. if len(result_list) == control:
  4048. unit_1 = unit
  4049. result_1 = result
  4050. else:
  4051. return ""
  4052. return sum(result_list) + result
  4053. serviceTime_dict = {"service_start": "", "service_end": "", "service_days": 0}
  4054. re_num = re.findall(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+',service_time)
  4055. for _num in re_num:
  4056. if not re.search("[十拾百佰千仟]",_num):
  4057. num = ""
  4058. for word in _num:
  4059. num += str(DigitsDic.get(word,word))
  4060. service_time = service_time.replace(_num,num,1)
  4061. else:
  4062. num = str(get_num(_num))
  4063. service_time = service_time.replace(_num,num,1)
  4064. end_time = ""
  4065. service_days = 0
  4066. re_page_time = re.search("20\d{2}-\d{2}-\d{2}", page_time)
  4067. page_time = re_page_time.group() if re_page_time else "2000-01-01" # page_time为空时默认值为2000-01-01
  4068. if re.search(pattern1,service_time):
  4069. # end_time = re.findall(pattern1,service_time)[-1]
  4070. time_list = []
  4071. for _time in re.findall(pattern1,service_time):
  4072. _time = re.sub("日","",_time)
  4073. _time = re.sub("[年月./]","-",_time)
  4074. _year,_month,_day = _time.split("-")
  4075. _month = int(_month)
  4076. _day = int(_day)
  4077. _year = int(_year)
  4078. if _year>2050 or _year<=2000 or _month>12 or _month<=0 or _day<=0 or _day>31:
  4079. service_days = 0
  4080. else:
  4081. if isValidDate(_year,_month,_day):
  4082. _time = str(_year)+'-'+str(_month)+'-'+str(_day)
  4083. _time = _time.split("-")[0] + '-' + _time.split("-")[1].rjust(2,"0") + '-' + _time.split("-")[2].rjust(2,"0")
  4084. time_list.append(_time)
  4085. if len(time_list)>=2:
  4086. if get_days_between(page_time,time_list[1])>1 and get_days_between(time_list[0],time_list[1])>0:
  4087. serviceTime_dict['service_end'] = time_list[1]
  4088. serviceTime_dict['service_start'] = time_list[0]
  4089. elif len(time_list)==1:
  4090. if get_days_between(page_time, time_list[0]) > 1:
  4091. serviceTime_dict['service_end'] = time_list[0]
  4092. # service_days = (time.mktime(time.strptime(end_time,"%Y-%m-%d"))-page_timestamp)/(24*60*60)
  4093. elif re.search(pattern3,service_time):
  4094. time_list = []
  4095. # end_time = re.findall(pattern3,service_time)[-1]
  4096. for _time in re.findall(pattern3,service_time):
  4097. _time = re.sub("月","",_time)
  4098. _time = re.sub("[年./]","-",_time)
  4099. _year,_month = _time.split("-")
  4100. _day = 0
  4101. _month = int(_month)
  4102. _year = int(_year)
  4103. if _year>2050 or _year<=2000 or _month>12 or _month<=0:
  4104. service_days = 0
  4105. else:
  4106. _day = get_month_days(_year,_month)
  4107. if isValidDate(_year, _month, _day):
  4108. _time = str(_year)+'-'+str(_month)+'-'+str(_day)
  4109. _time = _time.split("-")[0] + '-' + _time.split("-")[1].rjust(2,"0") + '-' + _time.split("-")[2].rjust(2,"0")
  4110. time_list.append(_time)
  4111. if len(time_list) >= 2:
  4112. if get_days_between(page_time, time_list[1]) > 1 and get_days_between(time_list[0], time_list[1]) > 0:
  4113. serviceTime_dict['service_end'] = time_list[1]
  4114. serviceTime_dict['service_start'] = time_list[0]
  4115. elif len(time_list)==1:
  4116. if get_days_between(page_time, time_list[0]) > 1:
  4117. serviceTime_dict['service_end'] = time_list[0]
  4118. # service_days = (time.mktime(time.strptime(end_time,"%Y-%m-%d"))-page_timestamp)/(24*60*60)
  4119. elif re.search(pattern2,service_time) or re.search(pattern4,service_time):
  4120. for pattern in [pattern2,pattern4]:
  4121. unit = 1
  4122. match = re.findall(pattern,service_time)
  4123. if len(set(match))==1:
  4124. match_text = match[0]
  4125. if "月" in match_text:
  4126. unit = 30
  4127. elif "年" in match_text:
  4128. unit = 365
  4129. elif "周" in match_text or "星期" in match_text:
  4130. unit = 7
  4131. match_num = float(re.search("\d+",match_text).group())
  4132. # 数字能被365整除,单位更正为天
  4133. if int(match_num)%365==0:
  4134. unit = 1
  4135. if unit==365:
  4136. if match_num>10:#单位为'年'时,排除数字过大的
  4137. match_num = 0
  4138. elif unit==30:
  4139. if match_num>60:#单位为'月'时,排除数字过大的
  4140. match_num = 0
  4141. elif unit==1:
  4142. if match_num>4000:#单位为'日'时,排除数字过大的
  4143. match_num = 0
  4144. service_days = int(match_num * unit)
  4145. if service_days % 360==0:
  4146. service_days = service_days / 360 * 365
  4147. elif service_days % 180==0 and service_days % 360!=0:
  4148. service_days = service_days // 360 * 365 + 180
  4149. service_days = int(service_days)
  4150. if service_days <= 1 and service_days > 4000:
  4151. service_days = 0
  4152. # if service_days>3:
  4153. if service_days>0:
  4154. # service_days = str(service_days) + "天"
  4155. serviceTime_dict['service_days'] = service_days
  4156. break
  4157. elif "半年" in service_time:
  4158. service_days = 180
  4159. # service_days = str(service_days) + "天"
  4160. serviceTime_dict['service_days'] = service_days
  4161. if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
  4162. service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
  4163. serviceTime_dict['service_days'] = service_days
  4164. return serviceTime_dict
  4165. def getServiceTime():
  4166. pass
  4167. def getOtherAttributes(list_entity,page_time,prem,channel_dic):
  4168. dict_other = {"moneysource":"",
  4169. "person_review":[],
  4170. "serviceTime":"",
  4171. "product":[],
  4172. "total_tendereeMoney":0,
  4173. "total_tendereeMoneyUnit":''}
  4174. list_serviceTime = []
  4175. last_moneysource_prob = 0
  4176. for entity in list_entity:
  4177. if entity.entity_type == 'bidway':
  4178. dict_other["bidway"] = turnBidWay(entity.entity_text)
  4179. elif entity.entity_type=='moneysource':
  4180. if dict_other["moneysource"] and entity.in_attachment:
  4181. continue
  4182. if not dict_other["moneysource"]:
  4183. dict_other["moneysource"] = entity.entity_text
  4184. last_moneysource_prob = entity.prob
  4185. elif entity.prob>last_moneysource_prob:
  4186. dict_other["moneysource"] = entity.entity_text
  4187. last_moneysource_prob = entity.prob
  4188. elif entity.entity_type=='serviceTime':
  4189. # print(entity.entity_text)
  4190. # if list_serviceTime and entity.in_attachment:
  4191. # continue
  4192. if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[-./]\d{1,2}", entity.entity_text):
  4193. list_serviceTime.append(entity)
  4194. elif entity.entity_type=="person" and entity.label ==4 and entity.entity_text not in dict_other["person_review"]: # 20240624评审专家去重
  4195. dict_other["person_review"].append(entity.entity_text)
  4196. elif entity.entity_type=='product' and entity.entity_text not in dict_other["product"]: #顺序去重保留
  4197. dict_other["product"].append(entity.entity_text)
  4198. elif entity.entity_type=='money' and entity.notes=='总投资' and float(dict_other["total_tendereeMoney"])<float(entity.entity_text):
  4199. dict_other["total_tendereeMoney"] = str(Decimal(entity.entity_text))
  4200. dict_other["total_tendereeMoneyUnit"] = entity.money_unit
  4201. time_contractEnd = prem[0].get("time_contractEnd","")[:10]
  4202. time_contractStart = prem[0].get("time_contractStart","")[:10]
  4203. serviceTime_dict = {"service_start":"", "service_end":"", "service_days": 0}
  4204. if time_contractEnd:
  4205. serviceTime_dict['service_end'] = time_contractEnd
  4206. if time_contractStart:
  4207. if get_days_between(time_contractStart,time_contractEnd)>0:
  4208. serviceTime_dict['service_start'] = time_contractStart
  4209. # print([i.entity_text for i in list_serviceTime])
  4210. if list_serviceTime and not serviceTime_dict['service_end']:
  4211. list_serviceTime_inAtt = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==1]
  4212. list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
  4213. error_serviceTime = []
  4214. for list_time in [list_serviceTime,list_serviceTime_inAtt]:
  4215. if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
  4216. list_time.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
  4217. for _serviceTime in list_time:
  4218. # 优先取具体时间(20XX年x月x日-20XX年x月x日)
  4219. if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
  4220. _extract_time = my_timeFormat(_serviceTime.entity_text,page_time)
  4221. if _extract_time and len(_extract_time)==2:
  4222. # 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
  4223. if _extract_time[0]!=_extract_time[1]:
  4224. # dict_other["serviceTime"] = _serviceTime.entity_text
  4225. # extract_time = extract_serviceTime(_serviceTime.entity_text)
  4226. # if extract_time['service_end']:
  4227. serviceTime_dict['service_start'] = _extract_time[0]
  4228. serviceTime_dict['service_end'] = _extract_time[1]
  4229. break
  4230. else:
  4231. error_serviceTime.append(_serviceTime.entity_text)
  4232. if not serviceTime_dict['service_end']:
  4233. for _serviceTime in list_time:
  4234. # 优先取具体时间(20XX年x月-20XX年x月)
  4235. if re.search("20\d{2}[年/.\-]\d{1,2}月?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,3}20\d{2}[年/.\-]\d{1,2}月?", _serviceTime.entity_text):
  4236. # dict_other["serviceTime"] = _serviceTime.entity_text
  4237. extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
  4238. if extract_time['service_end']:
  4239. serviceTime_dict = extract_time
  4240. break
  4241. if not serviceTime_dict['service_end']:
  4242. for _serviceTime in list_time:
  4243. # 优先取具体时间(20XX年x月x日)
  4244. if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
  4245. if _serviceTime.entity_text not in error_serviceTime:
  4246. # dict_other["serviceTime"] = _serviceTime.entity_text
  4247. extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
  4248. if extract_time['service_end']:
  4249. serviceTime_dict = extract_time
  4250. break
  4251. if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
  4252. for _serviceTime in list_time:
  4253. if _serviceTime.entity_text not in error_serviceTime:
  4254. # dict_other["serviceTime"] = _serviceTime.entity_text
  4255. extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
  4256. # service_days > 3
  4257. if extract_time['service_end'] or extract_time['service_days']>3:
  4258. serviceTime_dict = extract_time
  4259. break
  4260. # 若上一步仍无结果,取消service_days > 3 的条件
  4261. if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
  4262. for _serviceTime in list_time:
  4263. if _serviceTime.entity_text not in error_serviceTime:
  4264. # dict_other["serviceTime"] = _serviceTime.entity_text
  4265. extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
  4266. if extract_time['service_end'] or extract_time['service_days']:
  4267. serviceTime_dict = extract_time
  4268. break
  4269. if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
  4270. service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
  4271. serviceTime_dict['service_days'] = service_days
  4272. dict_other["serviceTime"] = serviceTime_dict
  4273. if not time_contractEnd and channel_dic['docchannel']['docchannel']=='合同公告': # 用serviceTime补充合同开始结束时间,公告类型为合同公告
  4274. if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
  4275. prem[0]["time_contractStart"] = serviceTime_dict['service_start']
  4276. prem[0]["time_contractEnd"] = serviceTime_dict['service_end']
  4277. if dict_other['moneysource']:
  4278. dict_other['moneysource'] = turnMoneySource(dict_other['moneysource'])
  4279. # dict_other["product"] = list(set(dict_other["product"])) # 已在添加时 顺序去重保留
  4280. return dict_other
  4281. def getMoneyRange(RoleList):
  4282. pass
  4283. def getProjectContacts(list_entity, list_sentence):
  4284. # project_contacts "项目联系人"提取
  4285. temp_person_entitys = [ent for ent in list_entity if ent.entity_type=='person' and ent.label in [1,2,3]]
  4286. temp_person_entitys = sorted(temp_person_entitys,key=lambda x:(x.sentence_index,x.wordOffset_begin))
  4287. project_contacts_patterns = ['项目.?联系[人方]','项目.?联系.?方式', '项目.?负责人']
  4288. project_contacts_patterns_prob = [0.9, 0.85, 0.8]
  4289. project_contacts_patterns_res = []
  4290. for ent in temp_person_entitys:
  4291. sent_idx = ent.sentence_index
  4292. word_begin = ent.wordOffset_begin
  4293. # word_end = ent.wordOffset_end
  4294. in_att = ent.in_attachment
  4295. if word_begin >= 5: # > len('项目联系人')
  4296. left_text = list_sentence[sent_idx].sentence_text[max(0, word_begin - 15):word_begin]
  4297. # print('left_text', left_text)
  4298. for pattern, prob in zip(project_contacts_patterns, project_contacts_patterns_prob):
  4299. if re.search(pattern, left_text):
  4300. project_contacts_patterns_res.append([ent, sent_idx, word_begin, prob if not in_att else prob / 2])
  4301. project_contacts_patterns_res = sorted(project_contacts_patterns_res, key=lambda x: (x[3], -x[1], -x[2]),
  4302. reverse=True)
  4303. # print('project_contacts_patterns_res', project_contacts_patterns_res)
  4304. project_contacts_list = []
  4305. phone_set = set()
  4306. have_in_text = False
  4307. if project_contacts_patterns_res:
  4308. for item in project_contacts_patterns_res:
  4309. in_att = item[0].in_attachment
  4310. contacts_person = item[0].entity_text
  4311. contacts_phone = item[0].person_phone[0].entity_text if item[0].person_phone else ""
  4312. if contacts_phone:
  4313. if not in_att:
  4314. have_in_text = True
  4315. if in_att and have_in_text: # 正文已提取,则排除附件的
  4316. break
  4317. if contacts_phone not in phone_set:
  4318. phone_set.add(contacts_phone)
  4319. project_contacts_list.append([contacts_person,contacts_phone])
  4320. return {'project_contacts':project_contacts_list}
  4321. def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time,winter_scope):
  4322. '''
  4323. @param:
  4324. list_sentence:所有文章的句子list
  4325. list_entity:所有文章的实体list
  4326. @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话
  4327. '''
  4328. result = []
  4329. for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
  4330. RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline,winter_scope)
  4331. result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
  4332. **getTimeAttributes(list_entity, list_sentence,page_time),
  4333. **getProjectContacts(list_entity, list_sentence),
  4334. **{"fingerprint": list_article.fingerprint,
  4335. "match_enterprise": list_article.match_enterprise,
  4336. "match_enterprise_type": list_article.match_enterprise_type,
  4337. "process_time": getCurrent_date(),
  4338. "attachmentTypes": list_article.attachmentTypes, "bidway": list_article.bidway}))
  4339. # result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence),
  4340. # **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
  4341. # "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
  4342. # "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))
  4343. return result
  4344. def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修改为 中标金额小于表格单价数量合计总金额十分之一时替换
  4345. '''
  4346. 最后根据表格提取的单价数量合计对比更新中标金额,或中标金额为0全文只有一个总价或合计时,作为中标金额
  4347. :param prem: 列表
  4348. :param total_product_money: 表格统计金额
  4349. :param list_articles: 文章对象
  4350. :return:
  4351. '''
  4352. if '##attachment##' in list_articles[0].content:
  4353. content, attachment = list_articles[0].content.split('##attachment##')
  4354. if len(content) < 200:
  4355. content += attachment
  4356. else:
  4357. content = list_articles[0].content
  4358. if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同|投标))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的
  4359. if total_product_money>0 and total_product_money<5000000000:
  4360. for value in prem[0]['prem'].values():
  4361. ree_money = float(value['tendereeMoney'])
  4362. for l in value['roleList']:
  4363. try:
  4364. # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
  4365. # l[2] = total_product_money
  4366. # log('修改中标金额为所有产品总金额')
  4367. # if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) == 0 and float(l["role_money"]['money'])<total_product_money/10:
  4368. if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or (float(l["role_money"]['money'])<ree_money/2 and float(l["role_money"]['money'])<total_product_money<ree_money)): # 改为小于一半招标金额或为0时替换为合计金额
  4369. l["role_money"]['money'] = total_product_money
  4370. # print('修改中标金额为所有产品总金额')
  4371. except Exception as e:
  4372. print('表格产品价格修正中标价格报错:%s'%e)
  4373. elif (len(re.findall('合计', content)) == 1 or len(re.findall('总价', content)) == 1):
  4374. ser = re.search('(?P<header>合计((万?元))?:)(?P<money>[\d,.]+(万?元)?)', content) if len(re.findall('合计', content)) == 1 else re.search('(?P<header>总价((万?元))?:)(?P<money>[\d,.]+(万?元)?)', content)
  4375. if ser:
  4376. money_text = ser.group('money')
  4377. header = ser.group('header')
  4378. money, money_unit = money_process(money_text, header)
  4379. if 100<money<8000000:
  4380. for value in prem[0]['prem'].values():
  4381. for l in value['roleList']:
  4382. try: # 如果原中标金额为0 或 金额小于合计金额0.1倍且正文没中标金额关键词 替换为 合计金额
  4383. if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money'])==0 or (float(l["role_money"]['money']) < money / 10 and re.search('(中标|成交|合同)(总?金额|[单报总]?价)', content) == None)):
  4384. l["role_money"]['money'] = str(money)
  4385. l["role_money"]['money_unit'] = money_unit
  4386. # print('修改中标金额为总价或合计金额')
  4387. except Exception as e:
  4388. print('修正中标价格报错:%s' % e)
  4389. def limit_maximum_amount(dic, list_entity):
  4390. '''
  4391. 通过关键词、行业、公告类别等设置最高最低角色金额
  4392. :param dic: 最终返回所有字段结果字典
  4393. :param list_entity: 实体列表
  4394. :return:
  4395. '''
  4396. indu_amount = {
  4397. '计算机设备': 200000000,
  4398. '办公设备': 100000000,
  4399. '家具用具': 500000000,
  4400. '办公消耗用品及类似物品': 100000000,
  4401. '日杂用品': 100000000,
  4402. '餐饮业': 1000000000,
  4403. '物业管理': 1000000000,
  4404. '工程技术与设计服务': 1000000000,
  4405. '工程评价服务': 100000000,
  4406. '其他工程服务': 100000000,
  4407. '工程监理服务': 100000000,
  4408. '工程造价服务': 100000000,
  4409. '会计、审计及税务服务': 100000000,
  4410. '其他专业咨询与调查': 100000000
  4411. }
  4412. title = dic.get('doctitle_refine', '')
  4413. name = dic.get('name', '')
  4414. product = ','.join(dic.get('product', []))
  4415. text = "%s;%s;%s"%(title, name, product)
  4416. doctype = dic.get('docchannel', {}).get('doctype', '') # 公告类型
  4417. industry = dic['industry'].get('class_name', '')
  4418. category = dic['industry'].get('class', '') # 行业门类
  4419. moneys = [float(it.entity_text) for it in list_entity if it.entity_type=='money' and re.search('^\d+(\.\d+)?', it.entity_text) and 5000<float(it.entity_text)<5000000]
  4420. maximum_amount = 10000000000
  4421. minximum_amount = 100
  4422. if re.search('监理|造价咨询|设计|勘察|招标代理中介服务|工程审计', text) and re.search('施工|总承包|ppp|PPP', text.replace('施工监理', '监理'))==None:
  4423. # print('监理设计等限额')
  4424. maximum_amount = 1000000000
  4425. minximum_amount = 200
  4426. elif re.search('施工|总承包|ppp|PPP|公路|道路|桥梁|铁路|土地使用权|地块|棚改|征地拆迁|棚户区改造|土地征收|建设用地|社会保险', text) or category in ['金融业', '建筑业'] or doctype == '土地矿产':
  4427. # print('施工、铁路等限额')
  4428. if industry in ['科研、医疗、教育用房', '住宅、商业用房', '场馆、站港用房','工业、生产用房','专业施工']:
  4429. maximum_amount = 20000000000
  4430. minximum_amount = 200
  4431. elif industry in ['修缮工程', '电气安装', '管道和设备安装', '建筑装饰和装修业', '建筑物拆除和场地准备活动']:
  4432. maximum_amount = 10000000000
  4433. minximum_amount = 100
  4434. else:
  4435. maximum_amount = 50000000000
  4436. minximum_amount = 500
  4437. elif re.search('(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text):
  4438. # print('商品采购限额')
  4439. maximum_amount = 80000000
  4440. minximum_amount = 10
  4441. elif re.search('修理|维修|(安保|保安|安全|保洁|物业|后勤|管理|代理|中介|印刷)服务', text):
  4442. # print('维修限额')
  4443. maximum_amount = 50000000
  4444. elif re.search('(速递|快递|邮政|邮寄)(物流)?服务', text):
  4445. # print('快递限额')
  4446. maximum_amount = 80000000
  4447. minximum_amount = 10
  4448. elif industry in indu_amount:
  4449. maximum_amount = indu_amount[industry]
  4450. # print('maximum_amount:', maximum_amount)
  4451. for value in dic['prem'].values():
  4452. for l in value['roleList']:
  4453. if l["role_name"] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
  4454. # date = float(re.search('(\d+)天', l.get('serviceTime', '')).group(1)) if re.search('(\d+)天', l.get('serviceTime', '')) else 0
  4455. serviceTime_dict = l.get('serviceTime', dict())
  4456. serviceTime_dict = serviceTime_dict if serviceTime_dict else dict()
  4457. date = serviceTime_dict.get("service_days",0)
  4458. if 0 < date < 180 and float(l["role_money"]['money']) > 10000000000: # 工期小于180天且金额大于百亿的,错误
  4459. l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
  4460. # print('工期纠正百亿以上金额 ')
  4461. elif float(l["role_money"]['money']) > maximum_amount:
  4462. flag = 1
  4463. for money in moneys:
  4464. if float(l["role_money"]['money'])/money == 10000 and l['role_money']['money_unit'] == '万元':
  4465. l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
  4466. # print('万倍关系纠正连接金额')
  4467. flag = 0
  4468. break
  4469. if flag and l["role_money"]['money_unit'] == '万元' or re.search('^\d{11,}(\.0)?$', str(l["role_money"]['money'])):
  4470. l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
  4471. # print('行业限额纠正连接金额')
  4472. elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]:
  4473. l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
  4474. # elif flag and l["role_money"]['money_unit'] == '元':
  4475. # l["role_money"]['money'] = 0
  4476. elif 0<float(l["role_money"]['money']) < minximum_amount:
  4477. if l["role_money"]['money_unit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(l["role_money"]['money'])):
  4478. # print('单位元小金额且格式类似万元的乘以万倍')
  4479. l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) * 10000)
  4480. # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例:520248605
  4481. # # print('中标金额小于限额:%d元 去除' % minximum_amount)
  4482. # l["role_money"]['money'] = 0
  4483. if float(value['tendereeMoney']) > maximum_amount:
  4484. flag = 1
  4485. for money in moneys:
  4486. if float(value['tendereeMoney'])/money == 10000 and l['role_money']['money_unit'] == '万元':
  4487. value['tendereeMoney'] = str(Decimal(value['tendereeMoney'])/10000)
  4488. # print('万倍关系纠正连接金额')
  4489. flag = 0
  4490. break
  4491. if (flag and value['tendereeMoneyUnit'] == '万元' or re.search('^\d{11,}(\.0)?$', str(value['tendereeMoney']))) and float(value['tendereeMoney']) > maximum_amount*100: #2024/5/23 改为单位万元且超过限额100倍才除一万,避免不合理纠正 比如 174255856 项目(系统)一亿变一万
  4492. value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000)
  4493. # print('行业限额纠正连接金额')
  4494. elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]:
  4495. value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000)
  4496. # elif flag and value['tendereeMoneyUnit'] == '元':
  4497. # value['tendereeMoney'] = 0
  4498. elif 0<float(value['tendereeMoney']) < minximum_amount:
  4499. if value['tendereeMoneyUnit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(value['tendereeMoney'])):
  4500. # print('单位元小金额且格式类似万元的乘以万倍')
  4501. value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) * 10000)
  4502. # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例:520248605
  4503. # # print('招标金额小于限额:%d元 去除' % minximum_amount)
  4504. # value['tendereeMoney'] = 0
  4505. def limit_maximum_amount_backup(prem, industry):
  4506. indu = industry['industry'].get('class_name', '')
  4507. indu_amount = {
  4508. '计算机设备': 200000000,
  4509. '办公设备': 100000000,
  4510. '家具用具': 500000000,
  4511. '办公消耗用品及类似物品': 100000000,
  4512. '日杂用品': 100000000,
  4513. '餐饮业': 1000000000,
  4514. '物业管理': 1000000000,
  4515. '工程技术与设计服务': 1000000000,
  4516. '工程评价服务': 100000000,
  4517. '其他工程服务': 100000000,
  4518. '工程监理服务': 100000000,
  4519. '工程造价服务': 100000000,
  4520. '会计、审计及税务服务': 100000000,
  4521. }
  4522. if indu in indu_amount:
  4523. maximum_amount = indu_amount[indu]
  4524. try:
  4525. for value in prem[0]['prem'].values():
  4526. for l in value['roleList']:
  4527. if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) > maximum_amount:
  4528. if indu in ['餐饮业', '物业管理']:
  4529. l["role_money"]['money'] = str(float(l["role_money"]['money'])/10000)
  4530. elif l["role_money"]['money_unit'] == '万元':
  4531. l["role_money"]['money'] = str(float(l["role_money"]['money'])/10000)
  4532. if float(value['tendereeMoney']) > maximum_amount:
  4533. if indu in ['餐饮业', '物业管理']:
  4534. value['tendereeMoney'] = float(value['tendereeMoney'])/10000
  4535. elif value['tendereeMoneyUnit'] == '万元':
  4536. value['tendereeMoney'] = float(value['tendereeMoney']) / 10000
  4537. except Exception as e:
  4538. print('行业分类限制最高金额抛出异常:%s' % e)
  4539. def get_win_joint(prem, list_entitys, list_sentences, list_articles):
  4540. '''
  4541. 获取联合体信息, 添加到prem
  4542. :param prem:
  4543. :param list_entitys:
  4544. :param list_sentences:
  4545. :param list_articles:
  4546. :return:
  4547. '''
  4548. try:
  4549. if 'win_tenderer' in str(prem[0]['prem']) and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]|与[^,。]{6,100}联合体', list_articles[0].content):
  4550. sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
  4551. for v in prem[0]['prem'].values():
  4552. for d in v['roleList']:
  4553. if d.get('role_name', '') == 'win_tenderer':
  4554. winner = d.get('role_text')
  4555. join_l = [winner]
  4556. for list_entity in list_entitys:
  4557. for i in range(len(list_entity)-1):
  4558. _entity = list_entity[i]
  4559. b = _entity.wordOffset_begin
  4560. e = _entity.wordOffset_end
  4561. if _entity.entity_type in ['org', 'company'] and _entity.label==2\
  4562. and _entity.entity_text==winner:
  4563. s = sentences[_entity.sentence_index].sentence_text
  4564. find_joint = 0 # 是否包含联合体
  4565. for j in range(i+1, len(list_entity)):
  4566. behind_entity = list_entity[j]
  4567. b2 = behind_entity.wordOffset_begin
  4568. e2 = behind_entity.wordOffset_end
  4569. if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
  4570. and b2-e<13 and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[e:b2]) or \
  4571. re.search('(联合(体|方|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
  4572. join_l.append(behind_entity.entity_text)
  4573. b = b2
  4574. e = e2
  4575. find_joint = 1
  4576. elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。', ')'] or s[e2:e2+3]=='联合体'):
  4577. join_l.append(behind_entity.entity_text)
  4578. b = b2
  4579. e = e2
  4580. elif e == e2: # 修复重复实体导致中断情况
  4581. continue
  4582. else:
  4583. break
  4584. if len(join_l)>1:
  4585. d['win_tenderer_joint'] = ','.join(set(join_l))
  4586. # behind_entity = list_entity[i + 1]
  4587. # if _entity.sentence_index== behind_entity.sentence_index and _entity.entity_type in ['org', 'company'] and _entity.label==2\
  4588. # and _entity.entity_text==winner and behind_entity.entity_type in ['org', 'company'] and behind_entity.label==5:
  4589. # s = sentences[_entity.sentence_index].sentence_text
  4590. # b = _entity.wordOffset_begin
  4591. # e = _entity.wordOffset_end
  4592. # b2 = behind_entity.wordOffset_begin
  4593. # e2 = behind_entity.wordOffset_end
  4594. # if re.search('(联合体)', s[e2:e2+6]) and b2-e<3:
  4595. # print('联合体:', s[max(0, b-10):e2+10])
  4596. # d['win_tenderer_joint'] = '%s,%s'%(_entity.entity_text, behind_entity.entity_text)
  4597. # break
  4598. # elif re.search('(联合体((牵头|主办)(人|方|单位)|主体)|牵头(人|方|单位))|(联合体)?成员:|特殊普通合伙:', s[e:b2]) and b2-e<10:
  4599. # d['win_tenderer_joint'] = '%s,%s' % (_entity.entity_text, behind_entity.entity_text)
  4600. # print('联合体:', s[max(0, b - 10):e2 + 10])
  4601. # break
  4602. except Exception as e:
  4603. print('获取联合体抛出异常', e)
  4604. def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, all_winner=False):
  4605. '''
  4606. 获取多中标人及正文、附件所有金额,多中标人multi_winner写入prem,返回金额列表
  4607. :param channel_dic:
  4608. :param prem:
  4609. :param list_entitys:
  4610. :param list_sentences:
  4611. :return:
  4612. '''
  4613. def add_multi_winner(pack_l, winner_l):
  4614. if len(prem[0]['prem']) > 1 and len(set([it[0] for it in pack_l])) > 1: # 多标段多中标人处理
  4615. pk_dic = {}
  4616. for ent in winner_l:
  4617. for i in range(len(pack_l)):
  4618. pk, s1, b1, _ = pack_l[i]
  4619. if ent[1] < s1 or ent[1] == s1 and ent[2] < b1:
  4620. break
  4621. elif (ent[1] > s1 or ent[1] == s1 and ent[2] > b1):
  4622. if i < len(pack_l) - 1:
  4623. pk2, s2, b2, _ = pack_l[i + 1]
  4624. if (ent[1] < s2 or ent[1] == s2 and ent[2] < b2):
  4625. if pk not in pk_dic:
  4626. pk_dic[pk] = set()
  4627. pk_dic[pk].add(ent[0])
  4628. else:
  4629. continue
  4630. else:
  4631. if pk not in pk_dic:
  4632. pk_dic[pk] = set()
  4633. pk_dic[pk].add(ent[0])
  4634. else:
  4635. continue
  4636. for pk, multi_winner in pk_dic.items():
  4637. multi_winner = multi_winner - tenderee_or_agency
  4638. if len(multi_winner) < 2:
  4639. continue
  4640. for k, v in prem[0]['prem'].items():
  4641. if pk == k:
  4642. for d in v['roleList']:
  4643. if d.get('role_name', '') == 'win_tenderer':
  4644. if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
  4645. d['multi_winner'] = ','.join(set(multi_winner))
  4646. elif 0 < len(prem[0]['prem']) < 3: # 修复 单包多中标人 例:285780273
  4647. multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
  4648. if len(multi_winner) > 1:
  4649. for v in prem[0]['prem'].values():
  4650. for d in v['roleList']:
  4651. if d.get('role_name', '') == 'win_tenderer':
  4652. if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
  4653. d['multi_winner'] = ','.join(set(multi_winner))
  4654. break
  4655. moneys = []
  4656. moneys_attachment = []
  4657. if channel_dic['docchannel']['life_docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
  4658. sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
  4659. finalists = [] # 入围供应商
  4660. multi_winner_l = [] # 保存中标人名称列表
  4661. tenderee_or_agency = set()
  4662. package_l = []
  4663. i = 0
  4664. while i < len(list_entitys[0])-1:
  4665. ent = list_entitys[0][i]
  4666. b_idx_fr = ent.wordOffset_begin
  4667. e_idx_fr = ent.wordOffset_end
  4668. i += 1
  4669. if ent.entity_type in ['money']:
  4670. money = float(ent.entity_text)
  4671. if ent.in_attachment:
  4672. moneys_attachment.append(money)
  4673. else:
  4674. moneys.append(money)
  4675. elif ent.entity_type in ['package']:
  4676. package_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
  4677. elif ent.entity_type in ['org', 'company']:
  4678. sentence_text = sentences[ent.sentence_index].sentence_text
  4679. pre_text = sentence_text[max(0, b_idx_fr - 10):b_idx_fr]
  4680. if ent.label in [0,1] and ent.values[ent.label] > 0.8:
  4681. tenderee_or_agency.add(ent.entity_text)
  4682. elif ent.label == 2 and (ent.values[ent.label] > 0.8 or all_winner):
  4683. multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
  4684. for j in range(i, len(list_entitys[0])):
  4685. ent_bh = list_entitys[0][j]
  4686. b_idx_bh = ent_bh.wordOffset_begin
  4687. e_idx_bh = ent_bh.wordOffset_end
  4688. if ent_bh.entity_type in ['org','company'] and ent_bh.label in [2,5] and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
  4689. if sentence_text[e_idx_fr:b_idx_bh] in [';', '、', '&', ',', '/', '//'] and (
  4690. len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in [';', '、', '&', ',','/', '//','。']): # 修复多中标人刚好在文末index超出报错,例子 407126558
  4691. multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
  4692. e_idx_fr = e_idx_bh
  4693. i = j + 1
  4694. else:
  4695. break
  4696. elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr: # 两实体间没符号分割情况
  4697. multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
  4698. e_idx_fr = e_idx_bh
  4699. i = j + 1
  4700. elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and e_idx_fr == e_idx_bh: # 处理 514603520 中国邮政储蓄银行股份有限公司淄博市临淄区支行 实体由于字典匹配重复两次情况
  4701. i = j + 1
  4702. else:
  4703. break
  4704. if re.search('入围', pre_text) and re.search('未入围', pre_text)==None:
  4705. finalists.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
  4706. elif all_winner==1 and ent.label in [3,4,5] and re.search('第[一二三四五六七八九十0-9]+名|候选(人|单位)|入围(单位|供应商)|投标银行', pre_text) and re.search('未', pre_text)==None:
  4707. multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
  4708. if len(multi_winner_l)>=2:
  4709. winner_main = [it for it in multi_winner_l if not it[3]]
  4710. winner_attn = [it for it in multi_winner_l if it[3]]
  4711. pack_main = [it for it in package_l if not it[3]]
  4712. pack_attn = [it for it in package_l if it[3]]
  4713. if len(set([it[0] for it in winner_main]))>=2: # 有两个及以上多中标人及多标段 例:441612746
  4714. add_multi_winner(pack_main, winner_main)
  4715. elif len(set([it[0] for it in winner_attn]))>=2:
  4716. add_multi_winner(pack_attn, winner_attn)
  4717. if len(finalists)>=2: # 多入围候选人
  4718. winner_main = [it for it in finalists if not it[3]]
  4719. winner_attn = [it for it in finalists if it[3]]
  4720. pack_main = [it for it in package_l if not it[3]]
  4721. pack_attn = [it for it in package_l if it[3]]
  4722. if len(set([it[0] for it in winner_main]))>=2: # 有两个及以上多中标人及多标段 例:276326152
  4723. add_multi_winner(pack_main, winner_main)
  4724. elif len(set([it[0] for it in winner_attn]))>=2:
  4725. add_multi_winner(pack_attn, winner_attn)
  4726. else:
  4727. for i in range(len(list_entitys[0])):
  4728. ent = list_entitys[0][i]
  4729. if ent.entity_type in ['money']:
  4730. money = float(ent.entity_text)
  4731. if ent.in_attachment:
  4732. moneys_attachment.append(money)
  4733. else:
  4734. moneys.append(money)
  4735. return {'moneys': list(set(moneys)), 'moneys_attachment': list(set(moneys_attachment))}
  def update_prem(old_prem, new_prem, in_attachment=False):
      '''
      Merge table-extracted package data into the existing extraction, in place.

      Both arguments map a package key to a package dict; the key ``'Project'``
      is the default package. Package dicts are read for ``roleList`` and
      ``tendereeMoney`` (and optionally ``code``, ``name``, ``in_attachment``);
      role dicts are read for ``role_name``, ``role_text``, ``serviceTime`` and
      ``role_money`` (``money`` / ``money_unit``).

      :param old_prem: packages from the non-table extraction; mutated in place.
      :param new_prem: packages extracted from tables; its keys may be renamed
          by this function so that they line up with ``old_prem``.
      :param in_attachment: truthy when ``new_prem`` comes from an attachment table.
      :return: ``0`` when ``in_attachment`` is truthy and ``old_prem`` already
          holds a ``win_tenderer`` / ``pre_win_tenderer`` role (nothing is merged
          in that case); ``None`` otherwise.
      '''

      def merge_package(old_pack, new_pack):
          # Merge code, name and roles of one table package into the old package.
          if new_pack.get('code', "") != "":
              old_pack['code'] = new_pack.get('code', "")
          if new_pack.get('name', "") != "":
              old_pack['name'] = new_pack.get('name', "")
          shared_roles = []  # table roles whose role_name already exists in the old package
          for d in old_pack['roleList']:
              for d2 in new_pack['roleList']:
                  if d['role_name'] != d2['role_name']:
                      continue
                  # Roles present on both sides take the table's non-empty values.
                  shared_roles.append(d2)
                  if d2['role_text'] != "":
                      d['role_text'] = d2['role_text']
                  if d2['serviceTime'] != "":
                      d['serviceTime'] = d2['serviceTime']
                  if float(d2['role_money']['money']) != 0:  # only replace with a non-zero table amount
                      d['role_money']['money'] = d2['role_money']['money']
                      d['role_money']['money_unit'] = d2['role_money']['money_unit']
                  # Copy over attributes only the table role has
                  # (e.g. multi_winner, other_winner_dic) when they are truthy.
                  for k2 in set(d2) - set(d):
                      if d2[k2]:
                          d[k2] = d2[k2]
          for d2 in new_pack['roleList']:
              if d2 not in shared_roles:  # roles found only by the table are appended
                  old_pack['roleList'].append(d2)

      if len(new_prem) < 1:
          return None

      # If the table yields 2+ packages and the old extraction has more packages
      # (or shares none), drop the old packages missing from the table: the table wins.
      # 2025-05-28: additionally require a winner in the table extraction to avoid
      # the wrong replacement seen in doc 626464296. Also covers docs like 443925411.
      if (len(new_prem) >= 2 and 'win_tenderer' in json.dumps(new_prem)
              and (len(new_prem) < len(old_prem) <= len(new_prem) * 2
                   or set(old_prem) & set(new_prem) == set())):
          for k in [k for k in old_prem if k not in new_prem and k != 'Project']:
              old_prem.pop(k)

      # Table has several packages but the old extraction has even more: drop the
      # old packages that were extracted from attachments and are unknown to the table.
      if len(old_prem) > len(new_prem) and len(new_prem) > 1 and in_attachment == False:
          stale = [k for k in old_prem
                   if 'in_attachment' in old_prem[k] and old_prem[k]['in_attachment']
                   and k not in new_prem and k != 'Project']
          for k in stale:
              old_prem.pop(k)

      # Attachment tables never override an extraction that already has a winner.
      if in_attachment:
          for v in old_prem.values():
              for d in v['roleList']:
                  if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
                      return 0

      # (A disabled rule used to skip the update when every table key was an
      # auto-numbered one and the table had more packages; see docs 244355092, 281854766.)

      if len(old_prem) == 2 and len(new_prem) == 1 and ('Project' in new_prem or set(new_prem) & set(old_prem) == set()):
          # Table found a single package while the old extraction has two:
          # file the table package under the old non-Project key.
          # The first non-Project key in insertion order is used so the choice is deterministic.
          k = next(key for key in old_prem if key != 'Project')
          k_new = next(iter(new_prem))
          new_prem[k] = new_prem.pop(k_new)
      elif len(old_prem) == 1 and len(new_prem) == 1 and 'Project' not in old_prem and set(new_prem) & set(old_prem) == set():
          # One package on each side with different keys: reuse the old key.
          k = next(key for key in old_prem if key != 'Project')
          k_new = next(iter(new_prem))
          new_prem[k] = new_prem.pop(k_new)
      if len(new_prem) == len(old_prem) == 1 and 'Project' not in new_prem and 'Project' in old_prem:
          # Table found a package number but the old extraction only has Project:
          # also merge the table package into Project (the numbered key is kept as well).
          k = next(iter(new_prem))
          new_prem['Project'] = new_prem[k]

      multi_tendereeMoney = []  # non-zero tender amounts of the table's non-Project packages
      for k, v in new_prem.items():
          if k == 'Project':
              if 'Project' in old_prem:
                  merge_package(old_prem['Project'], v)
                  if float(new_prem['Project']['tendereeMoney']) != 0:
                      # 2024-05-08: let the table correct a wrong tender amount (doc 464187225).
                      old_prem['Project']['tendereeMoney'] = new_prem['Project']['tendereeMoney']
              else:
                  old_prem[k] = v
          else:
              if v['tendereeMoney'] != 0:
                  multi_tendereeMoney.append(v['tendereeMoney'])
              if k.startswith('自增'):
                  # Table rows without a package number are keyed '自增<n>'; strip the prefix.
                  k = k[2:]
              if k not in old_prem:  # package only known to the table: add as is
                  old_prem[k] = v
              else:
                  merge_package(old_prem[k], v)
                  if v['tendereeMoney'] != 0:
                      old_prem[k]['tendereeMoney'] = v['tendereeMoney']  # 2024-05-24: use the table amount

      # Per-package tender amounts from the table supersede the Project-level amount.
      if multi_tendereeMoney and 'Project' in old_prem and float(old_prem['Project']['tendereeMoney']) != 0:
          old_prem['Project']['tendereeMoney'] = 0

      # NOTE(review): the source lost its indentation; this clean-up is assumed to
      # run only when new_prem is non-empty — confirm against the upstream file.
      tenderee_l = [d2['role_text'] for v in old_prem.values() for d2 in v['roleList'] if d2['role_name'] == 'tenderee']
      winner_l = [d2['role_text'] for v in old_prem.values() for d2 in v['roleList'] if d2['role_name'] == 'win_tenderer']
      if set(tenderee_l) & set(winner_l):
          # Drop tenderee roles whose name also appears as a winner.
          for k in old_prem:
              old_prem[k]['roleList'] = [d for d in old_prem[k]['roleList']
                                         if not (d['role_name'] == 'tenderee' and d['role_text'] in winner_l)]
      return None
  def rule_add_role(docid, prem, channel, content, web_source_no, nlp_enterprise):
      '''
      Add a missing tenderee / winner role to ``prem`` using regex rules on ``content``.

      Only acts when ``channel['docchannel']['docchannel']`` is '招标公告' and no
      ``tenderee`` role is serialised in ``prem``, or when it is '中标信息' and no
      ``win_tenderer`` role is serialised in ``prem``. The role is appended to
      ``prem['Project']['roleList']``; the ``Project`` package is created when absent.

      :param docid: document id, only used in the log message.
      :param prem: package dict, mutated in place.
      :param channel: dict read as ``channel['docchannel']['docchannel']``.
      :param content: document text searched by the rules.
      :param web_source_no: source site code; 'DX000752' enables a site-specific fallback.
      :param nlp_enterprise: sequence of enterprise names; used by that fallback
          only when it holds exactly one name.
      :return: None
      '''
      # A candidate name is rejected when it looks like test/placeholder data ...
      reject_name = r'测试|演示|某|\d号|\*|XX'
      # ... and accepted only when it has a region prefix and an organisation-like suffix.
      accept_name = r'^\w{1,5}[省市县区][\w()]{2,25}[厂店铺市场行部城室馆中心站处社会狱所园关局司署段厅院队小学]((个体工商户)?|(普通合伙)?)?$'

      def add_role(ent_name, role_type, prem):
          # Append a rule-generated role (fixed probability 0.6) to the Project package.
          role = {
              "address": "",
              "linklist": [],
              "role_money": {
                  "discount_ratio": "",
                  "downward_floating_ratio": "",
                  "floating_ratio": "",
                  "money": 0,
                  "money_unit": ""
              },
              "role_name": role_type,
              "role_prob": 0.6,
              "role_text": ent_name,
              "rule_add_role": True,
              "serviceTime": ""
          }
          if 'Project' in prem:
              prem['Project']['roleList'].append(role)
          else:
              prem['Project'] = {
                  "code": "",
                  "name": "",
                  "roleList": [role],
                  "tendereeMoney": 0,
                  "tendereeMoneyUnit": "",
                  "uuid": str(uuid.uuid4())
              }

      def is_plausible(ent_name):
          return re.search(reject_name, ent_name) is None and re.search(accept_name, ent_name)

      docchannel = channel['docchannel']['docchannel']
      if docchannel == '招标公告' and re.search('"role_name": "tenderee"', json.dumps(prem)) is None:
          match = re.search(r'(招标|采购|招商)(人|商|单位|部门)(信息[,:]?)?(名称)?((甲方))?:(?P<name>[\w()—-]{4,35})([,。]|$)', content)
          if match:
              ent_name = match.group('name')
              if is_plausible(ent_name):  # or is_enterprise_exist(ent_name)
                  log('规则补充招标人角色:%s,docid:%s' % (ent_name, docid))
                  add_role(ent_name, "tenderee", prem)
          # NOTE(review): the source lost its indentation; this fallback is assumed to
          # apply when the label regex found nothing — confirm against the upstream file.
          elif web_source_no == 'DX000752' and len(nlp_enterprise) == 1 and re.search('更多信息点击报价地址', content):  # fix for doc 628311260
              ent_name = nlp_enterprise[0]
              add_role(ent_name, "tenderee", prem)
      elif docchannel == '中标信息' and re.search('"role_name": "win_tenderer"', json.dumps(prem)) is None:
          # NOTE(review): the pattern as found had an unbalanced ')' after
          # '(中标|中选|成交)' and could not compile (re.error). It is treated here as an
          # optional literal closing parenthesis (either width) — confirm against upstream.
          match = re.search(r'((中标|中选|成交)[)）]?(人|方|供应商|服务商|单位|部门)|(拟定|[,。])供应商)(信息[,:]?)?(名称)?((乙方))?:(?P<name>[\w()—-]{4,35})([,。]|$)', content)
          if match:
              ent_name = match.group('name')
              if is_plausible(ent_name):  # or is_enterprise_exist(ent_name)
                  log('规则补充中标人角色:%s,docid:%s' % (ent_name, docid))
                  add_role(ent_name, "win_tenderer", prem)
  def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
      '''
      Rule-based sanity pass over ``prem`` (mutated in place).

      With more than one package: every package gets a fresh ``uuid``; winner-type
      roles are removed from ``Project`` when they duplicate a winner of another
      package or when another package's winner has the higher probability; in the
      two-package case the other package is dropped when Project's winner has the
      higher probability; packages with an empty ``roleList`` are dropped when
      another package has a winner and the channel is a result-type channel.
      Afterwards, with exactly two packages, packages without roles and without
      tender amount are dropped, and for deposit projects with a single package
      a zero tender amount is replaced by ``total_tendereeMoney``.

      :param prem: dict mapping package key to package dict (``roleList``, ``tendereeMoney`` ...).
      :param channel_dic: dict read as ``channel_dic['docchannel']['docchannel']``.
      :param is_deposit_project: enables the total-investment fallback.
      :param total_tendereeMoney: total investment used by that fallback.
      :return: None
      '''
      winner_roles = ['win_tenderer', 'pre_win_tenderer', 'second_tenderer', 'third_tenderer']
      if len(prem) > 1:
          pro_winner = set()      # winner names of the Project package
          other_winner = set()    # winner names of all other packages
          other_winner_prob = 0
          pro_winner_prob = 0
          empty_roleList = []
          for k in prem:
              prem[k]['uuid'] = str(uuid.uuid4())  # 2024-06-27: every package carries a uuid
              if prem[k]['roleList'] == []:
                  empty_roleList.append(k)
              for d in prem[k]['roleList']:
                  if d['role_name'] not in winner_roles:
                      continue
                  names = {d['role_text']}
                  if 'win_tenderer_joint' in d:
                      names.update(d['win_tenderer_joint'].split(','))
                  if 'multi_winner' in d:
                      names.update(d['multi_winner'].split(','))
                  # Only a win_tenderer above 0.6 contributes a probability.
                  confident = d['role_name'] == 'win_tenderer' and d.get('role_prob', 0) > 0.6
                  if k == 'Project':
                      pro_winner.update(names)
                      if confident:
                          pro_winner_prob = d.get('role_prob', 0)
                  else:
                      other_winner.update(names)
                      if confident:
                          other_winner_prob = d.get('role_prob', 0)
          if pro_winner != set() and (pro_winner & other_winner != set() or other_winner_prob > pro_winner_prob):
              # Project's winner duplicates, or is less certain than, another package's winner.
              prem['Project']['roleList'] = [d for d in prem['Project']['roleList']
                                             if d['role_name'] not in ['win_tenderer', 'second_tenderer', 'third_tenderer']]
          elif other_winner_prob < pro_winner_prob and len(prem) == 2:
              # Two packages and Project's winner is the more certain one: drop the other package.
              for k in [k for k in prem if k != 'Project']:
                  prem.pop(k)
          if other_winner and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
              for k in empty_roleList:
                  prem.pop(k, None)  # tolerate a key already removed above
      elif "Project" in prem:
          prem['Project']['uuid'] = str(uuid.uuid4())

      if len(prem) == 2:
          # 2025-03-10: drop packages that have no role and no tender amount.
          del_k = [k for k, v in prem.items()
                   if v.get('roleList', []) == [] and v.get('tendereeMoney', 0) == 0 and v.get('unit_tendereeMoney', 0) == 0]
          for k in del_k:
              prem.pop(k)

      if is_deposit_project and float(total_tendereeMoney) != 0 and len(prem) == 1:
          # 2024-11-07: deposit project with a total investment but no tender amount
          # and a single package: use the total investment as tender amount.
          for k in prem:
              if float(prem[k]['tendereeMoney']) == 0:
                  prem[k]['tendereeMoney'] = total_tendereeMoney
  def add_package_name(prem, list_entity, product_list, name):
      '''
      Fill in missing package names from product entities or the project name.

      With more than two packages and more than two products: a package entity
      immediately followed (same sentence, same attachment flag, gap of 1-2
      characters) by a product entity is paired with that product; if more than
      one such pair exists, packages in ``prem`` whose key equals the package
      text and whose ``name`` is empty receive the product as name.
      Otherwise, with at most two packages and a non-empty ``name``, every
      package with an empty ``name`` receives the project name.

      :param prem: dict mapping package key to package dict; mutated in place.
      :param list_entity: entities exposing ``entity_type``, ``entity_text``,
          ``sentence_index``, ``wordOffset_begin``, ``wordOffset_end``, ``in_attachment``.
      :param product_list: extracted products; only its length is used.
      :param name: project name.
      :return: None
      '''
      if len(prem) > 2 and len(product_list) > 2:
          ent_l = [
              (entity.entity_type, entity.entity_text, entity.sentence_index,
               entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment)
              for entity in list_entity
              if entity.entity_type in ['product', 'package']
          ]
          # Order by position in the document: sentence index, then start offset.
          ent_l.sort(key=lambda x: (x[2], x[3]))
          pk_dic = {}  # package text -> product text that directly follows it
          for (ty1, ent1, s1, _b1, e1, in_att1), (ty2, ent2, s2, b2, _e2, in_att2) in zip(ent_l, ent_l[1:]):
              if in_att1 == in_att2 and ty1 == 'package' and ty2 == 'product' and s1 == s2 and 0 < b2 - e1 < 3:
                  pk_dic[ent1] = ent2
          if len(pk_dic) > 1:
              for k, v in prem.items():
                  if k in pk_dic and v.get('name', '') == '':
                      v['name'] = pk_dic[k]
      elif name != '' and len(prem) <= 2:
          # 2024-11-29: at most two packages without a package name -> use the project name.
          for k in prem:
              if prem[k].get('name', '') == '':
                  prem[k]['name'] = name
  def fix_single_source(prem, channel_dic, original_docchannel):
      '''
      Downgrade winners to pre-winners for single-source tender announcements.

      When ``prem['bidway']`` is '单一来源', the predicted channel is '招标公告'
      and ``original_docchannel`` equals 52, every ``win_tenderer`` role in
      ``prem['prem']`` is renamed to ``pre_win_tenderer`` in place.

      :param prem: result dict holding ``bidway`` and the package dict under ``prem``.
      :param channel_dic: dict read as ``channel_dic['docchannel']['docchannel']``.
      :param original_docchannel: original channel code of the document.
      :return: None
      '''
      if prem.get('bidway', '') == '单一来源' and channel_dic['docchannel']['docchannel'] == '招标公告' and original_docchannel == 52:
          for package in prem['prem'].values():
              for role in package['roleList']:
                  if role['role_name'] == "win_tenderer":
                      role['role_name'] = 'pre_win_tenderer'
  def demand_to_prem(demand, prem):
      '''
      Add demand entries as packages to ``prem`` when there are more demands than packages.

      Every entry of ``demand['data']`` is numbered from 1 via ``demand_id``
      (written back into the entry). Entries with a non-empty ``project_name``
      and ``budget`` become a package keyed by the project name, or by
      ``'<project_name>_<demand_id>'`` when that key already exists in ``prem``.

      :param demand: dict with an optional ``data`` list of demand dicts; entries are mutated.
      :param prem: dict mapping package key to package dict; mutated in place.
      :return: None
      '''
      demands = demand.get('data', [])
      if len(demands) <= len(prem):
          return
      # NOTE(review): the source lost its indentation; the counter is assumed to
      # advance for every entry, not only for those that become packages — confirm.
      for i, d in enumerate(demands, start=1):
          d['demand_id'] = i
          project_name = d.get('project_name', '')
          if project_name == '' or d.get('budget', '') == '':
              continue
          key = project_name if project_name not in prem else project_name + '_%d' % i
          prem[key] = {
              'demand_id': i,
              'code': '',
              'name': project_name,
              'roleList': [],
              'tendereeMoney': d.get('budget', ''),
              'tendereeMoneyUnit': ""
          }
  if __name__ == "__main__":
      # Everything below is disabled: both snippets live inside string literals
      # (the `''''''` line closes the first literal and opens the second), so
      # running the module as a script executes nothing here.
      # Snippet 1 reads (doc_id, prem) rows from the database and collects the
      # extracted roles; snippet 2 dumps the collected rows into an HTML table.
      '''
      conn = getConnection()
      cursor = conn.cursor()
      #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
      sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
      result = []
      cursor.execute(sql)
      rows = cursor.fetchall()
      count = 0
      for row in rows:
          count += 1
          # print(count)
          doc_id = row[0]
          roleList = getPackageRoleMoney(doc_id)
          result.append([doc_id,str(roleList),row[1]])
      ''''''
      with codecs.open("getAttribute.html","w",encoding="utf8") as f:
          f.write('<html><head>\
          <meta http-equiv="Content-Type"\
          content="text/html; charset=UTF-8">\
          </head>\
          <body bgcolor="#FFFFFF">\
          <table border="1">\
          <tr>\
          <td>doc_id</td>\
          <td>角色</td>\
          </tr>')
          for item in result:
              f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
          f.write("</table></body>")
      '''