12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
7477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903 |
- # from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process,getDigitsDic,isValidDate
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
- from decimal import Decimal
- import re
- import copy
- import math
- import pandas as pd
- import os
- from scipy.optimize import linear_sum_assignment
- from BiddingKG.dl.interface.Entitys import Match
- import numpy as np
- import uuid
- import time,calendar
- from datetime import datetime
def getTheRole(entity, role_list):
    """Return the index of the first role group that contains ``entity``.

    Args:
        entity: Entity name to look up.
        role_list: Sequence of containers, one per role; membership is
            tested with ``in``.

    Returns:
        The index of the first element of ``role_list`` that contains
        ``entity``, or ``None`` when no element contains it.
    """
    for role_index, role_members in enumerate(role_list):
        if entity in role_members:
            return role_index
    return None
# Role id (as a string) -> role name.
dict_role_id = {
    "0": "tenderee",
    "1": "agency",
    "2": "win_tenderer",
    "3": "second_tenderer",
    "4": "third_tenderer",
}

# Role name -> role id (as an int). Derived from dict_role_id so the two
# tables cannot drift apart.
role2id_dict = {role_name: int(role_id) for role_id, role_name in dict_role_id.items()}
def getPackage(packageList, sentence_index, begin_index, roleid, MAX_DIS=None, DIRECT=None):
    """Find the package (bid section) that a company entity belongs to.

    Multi-package handling: once the scope of every package is known, all
    packages whose scope contains the entity are collected. Walking them
    front to back, the first one that has not yet been assigned ``roleid``
    is returned (and marked as hit). If every candidate already has
    ``roleid``, the first candidate is returned. If there is no candidate,
    ``None`` is returned.

    Args:
        packageList: List of package dicts. This function reads the keys
            ``"sentence_index"``, ``"offsetWords_begin"``, ``"scope"``
            (``[[begin_sentence, begin_offset], [end_sentence, end_offset]]``),
            ``"hit"`` (a set of role ids, mutated here) and ``"pointer"``.
        sentence_index: Index of the sentence containing the entity.
        begin_index: Start offset of the entity inside that sentence.
        roleid: Role id being assigned.
        MAX_DIS: Optional maximum sentence distance between the entity and
            the package mention; ``None`` disables the check.
        DIRECT: ``"L"`` to accept only packages mentioned at or before the
            entity, ``"R"`` to accept only packages mentioned at or after
            it, ``None`` for no direction constraint.

    Returns:
        A ``(pointer, already_hit)`` tuple. ``pointer`` is the ``"pointer"``
        value of the chosen package or ``None``; ``already_hit`` is ``True``
        only when every candidate package already had ``roleid``.
    """
    if not packageList:
        return None, False

    legal_indices = []
    for pack_index, pack in enumerate(packageList):
        # Direction filter: "L" drops packages mentioned after the entity.
        if DIRECT == "L" and (
            pack["sentence_index"] > sentence_index
            or (pack["sentence_index"] == sentence_index
                and pack["offsetWords_begin"] > begin_index)
        ):
            continue
        # "R" drops packages mentioned before the entity. The key used to be
        # misspelled "offsetwords_begin" here, which raised KeyError.
        if DIRECT == "R" and (
            pack["sentence_index"] < sentence_index
            or (pack["sentence_index"] == sentence_index
                and pack["offsetWords_begin"] < begin_index)
        ):
            continue

        # The entity position must lie inside the package scope (inclusive).
        scope_begin = pack["scope"][0]
        if not (scope_begin[0] < sentence_index
                or (scope_begin[0] == sentence_index and scope_begin[1] <= begin_index)):
            continue
        scope_end = pack["scope"][1]
        if not (scope_end[0] > sentence_index
                or (scope_end[0] == sentence_index and scope_end[1] >= begin_index)):
            continue

        if MAX_DIS is not None and abs(sentence_index - pack["sentence_index"]) > MAX_DIS:
            continue
        legal_indices.append(pack_index)

    # Prefer the first candidate that has not been given this role yet.
    for pack_index in legal_indices:
        pack = packageList[pack_index]
        if roleid not in pack["hit"]:
            pack["hit"].add(roleid)
            return pack["pointer"], False

    # Every candidate already has the role: fall back to the first
    # *candidate* (previously packageList[0], which may not even contain
    # the entity in its scope).
    if legal_indices:
        return packageList[legal_indices[0]]["pointer"], True
    return None, False
- # Generate the legal role/entity combinations: build the combinations of each package, drop combinations that are subsets of others, cross-combine the packages, then split any combination whose entity appears both under the "Project" package and under another package. NOTE(review): block indentation is missing in this copy of the file, so the nesting below must be restored from the upstream source before it can run.
- def get_legal_comba(list_entity,dict_role_combination):
- 
- # Collect all legal combinations within one package
- def circle_package(_dict_legal_combination):
- list_dict_role_first = []
- for _role in _dict_legal_combination:
- if len(list_dict_role_first)==0:
- for _entity in _dict_legal_combination[_role]:
- if _entity !="":
- list_dict_role_first.append({_role:_entity})
- else:
- list_dict_role_after = []
- _find_count = 0  # incremented below but never read inside this function
- for _entity in _dict_legal_combination[_role]:
- if _entity !="":
- for _dict in list_dict_role_first:
- _flag = True
- for _key1 in _dict:
- if _entity==_dict[_key1]:
- # Changed so that the tenderee ("0") and the agency ("1") may be the same entity
- if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
- _flag = True
- else:
- _flag = False
- if _flag:
- _find_count += 1
- _new_dict = copy.copy(_dict)
- _new_dict[_role] = _entity
- if len(list_dict_role_after)>100000:  # hard cap on the number of generated combinations
- break
- list_dict_role_after.append(_new_dict)
- else:
- # 2021/5/25 update: the same entity (entity_text) with a different role
- if len(list_dict_role_after) > 100000:
- break
- for _dict in list_dict_role_first:
- for _key1 in _dict:
- if _entity == _dict[_key1]:
- _new_dict = copy.copy(_dict)
- _new_dict.pop(_key1)
- _new_dict[_role] = _entity
- list_dict_role_after.append({_role:_entity})  # NOTE(review): appends a single-role dict, not the _new_dict built just above; confirm intent and nesting against upstream
- if len(list_dict_role_after)==0:
- pass
- else:
- list_dict_role_first.extend(list_dict_role_after)
- return list_dict_role_first
- def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):  # recursive variant; its only call in this function is commented out
- last_layer = False
- # An empty combination returns an empty list
- if len(_dict_legal_combination.keys())==0:
- return []
- # Flip the flag once the recursion reaches the last layer
- if len(_dict_legal_combination.keys())==1:
- last_layer = True
- # Take one role and iterate over its candidates
- _key_role = list(_dict_legal_combination.keys())[0]
- for item in _dict_legal_combination[_key_role]:
- copy_dict_one_selution = copy.copy(dict_one_selution)
- copy_dict_legal_combination = {}
- copy_set_legal_entity = copy.copy(set_legal_entity)
- 
- # Copy all remaining roles for the next round of recursion
- for _key in _dict_legal_combination.keys():
- if _key!=_key_role:
- copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
- # Changed so that the tenderee and the agency may be the same entity
- if item !="":
- _flag = True
- if str(_key_role) in ["0","1"]:
- for _key_flag in copy_dict_one_selution:
- if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
- _flag = False
- else:
- for _key_flag in copy_dict_one_selution:
- if copy_dict_one_selution[_key_flag]==item:
- _flag = False
- if _flag:
- copy_dict_one_selution[_key_role] = item
- 
- '''
- if item not in copy_set_legal_entity:
- if item !="":
- copy_dict_one_selution[_key_role] = item
- '''
- copy_set_legal_entity.add(item)
- if last_layer:
- list_all_selution.append(copy_dict_one_selution)
- else:
- recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
- 
- # Recursively match the results of each package (its only call in this function is commented out)
- def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
- last_layer = False
- if len(_dict_legal_combination.keys())==0:
- return []
- if len(_dict_legal_combination.keys())==1:
- last_layer = True
- _key_pack = list(_dict_legal_combination.keys())[0]
- for item in _dict_legal_combination[_key_pack]:
- copy_dict_one_selution = copy.copy(dict_one_selution)
- copy_dict_legal_combination = {}
- for _key in _dict_legal_combination.keys():
- if _key!=_key_pack:
- copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
- for _key_role in item.keys():
- copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]  # flattened key: "<package>$$<role>"
- if last_layer:
- list_all_selution.append(copy_dict_one_selution)
- else:
- recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
- return list_all_selution
- 
- # Iteratively build the combinations across all packages
- def circle_pageages(_dict_legal_combination):
- list_all_selution = []
- for _key_pack in _dict_legal_combination.keys():
- list_key_selution = []
- for item in _dict_legal_combination[_key_pack]:
- _dict = dict()
- for _key_role in item.keys():
- _dict[_key_pack+"$$"+_key_role] = item[_key_role]  # flattened key: "<package>$$<role>"
- list_key_selution.append(_dict)
- if len(list_all_selution)==0:
- list_all_selution = list_key_selution
- else:
- _list_all_selution = []
- for item_1 in list_all_selution:
- for item_2 in list_key_selution:
- _list_all_selution.append(dict(item_1,**item_2))  # merge one choice per package into a single dict
- list_all_selution = _list_all_selution
- return list_all_selution
- 
- # Get the parsed result for each package
- _dict_legal_combination = {}
- for packageName in dict_role_combination.keys():
- _list_all_selution = []
- # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
- _list_all_selution = circle_package(dict_role_combination[packageName])
- '''
- # print("===1")
- # print(packageName)
- for item in _list_all_selution:
- # print(item)
- # print("===2")
- '''
- # Drop combinations that are a subset of another combination
- list_all_selution_simple = []
- _list_set_all_selution = []
- for item_selution in _list_all_selution:
- item_set_selution = set()
- for _key in item_selution.keys():
- item_set_selution.add((_key,item_selution[_key]))
- _list_set_all_selution.append(item_set_selution)
- if len(_list_set_all_selution)>1000:  # above this size the unpruned list is stored and the subset pruning is skipped
- _dict_legal_combination[packageName] = _list_all_selution
- continue
- for i in range(len(_list_set_all_selution)):
- 
- be_included = False
- for j in range(len(_list_set_all_selution)):
- if i!=j:
- if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):  # i is contained in j and differs in size
- be_included = True
- if not be_included:
- list_all_selution_simple.append(_list_all_selution[i])
- _dict_legal_combination[packageName] = list_all_selution_simple
- _list_final_comba = []
- # Combine the results of the individual packages
- _comba_count = 1
- for _key in _dict_legal_combination.keys():
- _comba_count *= len(_dict_legal_combination[_key])  # product of the per-package combination counts
- # If the count is too large, keep only the highest-probability combination of each package
- dict_pack_entity_prob = get_dict_entity_prob(list_entity)
- if _comba_count>250:
- new_dict_legal_combination = dict()
- for _key_pack in _dict_legal_combination.keys():
- MAX_PROB = -1000
- _MAX_PROB_COMBA = None
- for item in _dict_legal_combination[_key_pack]:
- # print(_key_pack,item)
- _dict = dict()
- for _key in item.keys():
- _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
- _prob = getSumExpectation(dict_pack_entity_prob, _dict)
- if _prob>MAX_PROB:
- MAX_PROB = _prob
- _MAX_PROB_COMBA = [item]
- if _MAX_PROB_COMBA is not None:
- new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
- _dict_legal_combination = new_dict_legal_combination
- #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
- _list_final_comba = circle_pageages(_dict_legal_combination)
- # Apart from the Project package (tenderee and agency), the other packages do not conflict with each other
- # Check whether an entity appears in both the Project package and another package; if so, the combination must be split
- _list_real_comba = []
- for dict_item in _list_final_comba:
- set_project = set()
- set_other = set()
- for _key in list(dict_item.keys()):
- if _key.split("$$")[0]=="Project":
- set_project.add(dict_item[_key])
- else:
- set_other.add(dict_item[_key])
- set_common = set_project&set_other  # entities assigned both under "Project" and under another package
- if len(set_common)>0:
- dict_project = {}
- dict_not_project = {}
- for _key in list(dict_item.keys()):
- if dict_item[_key] in set_common:
- if str(_key.split("$$")[0])=="Project":
- dict_project[_key] = dict_item[_key]
- else:
- dict_not_project[_key] = dict_item[_key]
- else:
- dict_project[_key] = dict_item[_key]
- dict_not_project[_key] = dict_item[_key]
- 
- _list_real_comba.append(dict_project)
- _list_real_comba.append(dict_not_project)
- else:
- _list_real_comba.append(dict_item)
- return _list_real_comba
def get_dict_entity_prob(list_entity, on_value=0.5):
    """Collect the best role probability for every (package, role, entity).

    Only entities whose ``entity_type`` is ``'org'`` or ``'company'`` are
    considered. Entities outside attachments are processed first, then
    entities inside attachments, whose probability is capped at 0.8. For a
    given key the highest probability seen wins.

    Args:
        list_entity: Iterable of entity objects. This function reads the
            attributes ``entity_type``, ``in_attachment``, ``values``,
            ``label``, ``packageName`` and ``entity_text``.
        on_value: Minimum role probability (inclusive) for an entity to be
            recorded.

    Returns:
        Dict mapping ``"<packageName>$$<label>$text$<entity_text>"`` to
        ``[entity_text, probability]``. Entities labelled ``"5"`` are
        skipped.
    """
    dict_pack_entity_prob = {}
    for in_attachment in (False, True):
        for entity in list_entity:
            if entity.entity_type not in ('org', 'company') or entity.in_attachment != in_attachment:
                continue
            role_prob = float(entity.values[int(entity.label)])
            _key = entity.packageName + "$$" + str(entity.label)
            if not (role_prob >= on_value and str(entity.label) != "5"):
                continue
            _key_prob = _key + "$text$" + entity.entity_text
            if in_attachment:
                # Attachment mentions are trusted less: cap their probability.
                role_prob = min(role_prob, 0.8)
            recorded = dict_pack_entity_prob.get(_key_prob)
            # Keep the maximum probability as the link probability.
            if recorded is None or role_prob > recorded[1]:
                dict_pack_entity_prob[_key_prob] = [entity.entity_text, role_prob]
    return dict_pack_entity_prob
# Compute the total expectation of a combination
def getSumExpectation(dict_pack_entity_prob, combination, on_value=0.5):
    """Score a role combination against the recorded entity probabilities.

    Every entry of ``dict_pack_entity_prob`` contributes the square of its
    probability: positively when ``combination`` assigns that entity to
    that package/role, negatively otherwise.

    Args:
        dict_pack_entity_prob: Dict mapping
            ``"<package>$$<role>$text$<entity_text>"`` to
            ``[entity_text, probability]``.
        combination: Dict mapping ``"<package>$$<role>"`` to the chosen
            entity text.
        on_value: Unused; kept for backward compatibility with callers.

    Returns:
        The summed signed squared probabilities (``0`` when
        ``dict_pack_entity_prob`` is empty).
    """
    expect = 0
    for _key_pack_entity, (entity_text, role_prob) in (
        (k, (v[0], v[1])) for k, v in dict_pack_entity_prob.items()
    ):
        _key_pack = _key_pack_entity.split("$text$")[0]
        # Each key is visited exactly once, so the signed value can be
        # accumulated directly instead of going through a second dict.
        if _key_pack in combination and combination[_key_pack] == entity_text:
            signed_prob = role_prob
        else:
            signed_prob = -role_prob
        symbol = 1 if signed_prob > 0 else -1
        expect += symbol * math.pow(signed_prob, 2)
    return expect
def getRoleList(list_sentence, list_entity, on_value=0.5):
    '''
    @summary: Search all non-conflicting role combinations and return the one
              with the highest summed expectation.
    @param:
        list_sentence: all sentences of the article
        list_entity: all entities of the article; company entities get
                     packageName / packageCode / roleName / pointer_pack set
                     here, and attachment probabilities are capped in place
        on_value: probability threshold for a company to count as a role
    @return: None when getPackagesFromArticle returns None, otherwise the tuple
             (RoleList, RoleSet, PackageList, PackageSet, win_tenderer_set,
              tenderee_or_agency_set, main_body_pack)
    '''
    pack = getPackagesFromArticle(list_sentence, list_entity)
    if pack is None:
        return None
    PackageList, PackageSet, dict_PackageCode, main_body_pack = pack

    company_types = ('org', 'company')
    # Upper bound applied to role probabilities that come from attachments.
    attachment_max_prob = 0.85
    # package name -> role label -> set of candidate company texts
    dict_role_combination = {}
    tenderee_or_agency_set = set()  # companies predicted as tenderee or agency
    win_tenderer_set = set()  # companies predicted as winning bidder

    # 2024/10/11: detect whether the main body already names a winner.
    main_contain_winner = False
    for entity in list_entity:
        if (entity.entity_type in company_types and entity.label == 2
                and entity.values[entity.label] > 0.7
                and entity.in_attachment == False):  # noqa: E712
            main_contain_winner = True
            break

    for entity in list_entity:
        if entity.entity_type not in company_types:
            continue
        label_key = str(entity.label)
        if label_key != "5" and entity.in_attachment:
            if entity.values[entity.label] > attachment_max_prob:
                entity.values[entity.label] = attachment_max_prob
        # Entities of three characters or fewer are ignored.
        if len(entity.entity_text) <= 3:
            continue
        role_prob = float(entity.values[int(entity.label)])
        if not role_prob >= on_value or label_key == "5":
            continue
        # 2024/10/11: once the main body names a winner, attachment bidders
        # are skipped (OCR typos in attachments otherwise create bogus
        # extra packages, e.g. doc 504046747).
        if main_contain_winner and entity.in_attachment and entity.label in [2, 3, 4]:
            continue

        # Tenderee/agency always belong to the whole project; other roles are
        # attached to the package whose scope contains them, if any.
        packageName = "Project"
        if label_key not in ("0", "1") and len(PackageSet) > 0:
            packagePointer, _ = getPackage(PackageList, entity.sentence_index,
                                           entity.begin_index, "role-" + label_key)
            if packagePointer is not None:
                entity.pointer_pack = packagePointer
                packageName = packagePointer.entity_text

        entity.packageCode = dict_PackageCode.get(packageName, "")
        entity.roleName = dict_role_id.get(label_key)
        entity.packageName = packageName

        roles_of_package = dict_role_combination.get(packageName)
        if roles_of_package is None:
            # Every role starts with the empty candidate ("nobody").
            roles_of_package = {str(_roleId): {""} for _roleId in [0, 1, 2, 3, 4]}
            dict_role_combination[packageName] = roles_of_package
        roles_of_package.setdefault(label_key, set()).add(entity.entity_text)

    list_real_comba = get_legal_comba(list_entity, dict_role_combination)

    # Pick the combination with the highest expectation (first one on ties).
    # The same threshold is used for the probability table and for the
    # candidate filter above; the search starts from -inf so that heavily
    # penalised combinations are still compared with each other.
    dict_pack_entity_prob = get_dict_entity_prob(list_entity, on_value=on_value)
    max_index = 0
    max_expect = float("-inf")
    for _index, item_combination in enumerate(list_real_comba):
        expect = getSumExpectation(dict_pack_entity_prob, item_combination)
        if expect > max_expect:
            max_index = _index
            max_expect = expect

    RoleList = []
    RoleSet = set()
    if len(list_real_comba) > 0:
        best_combination = list_real_comba[max_index]
        for _key, entity_text in best_combination.items():
            key_parts = _key.split("$$")
            packageName = key_parts[0]
            label = key_parts[1]
            role_name = dict_role_id.get(str(label))
            entity_prob = dict_pack_entity_prob.get(_key + '$text$' + entity_text, ['', 0])[1]
            packagecode = dict_PackageCode.get(packageName, "")
            RoleList.append(PREM(packageName, packagecode, role_name, entity_text, entity_prob, 0, 0.0, []))
            if str(label) in ["0", "1"]:
                tenderee_or_agency_set.add(entity_text)
            elif str(label) in ["2"] and entity_prob > 0.8:
                win_tenderer_set.add(entity_text)
            RoleSet.add(entity_text)

    # Drop package links of entities that did not make it into the best
    # combination under that package.
    linked_pairs = {(_prem.packageName, _prem.entity_text) for _prem in RoleList}
    for _entity in list_entity:
        pointer = _entity.pointer_pack
        if pointer is not None and (pointer.entity_text, _entity.entity_text) not in linked_pairs:
            _entity.pointer_pack = None

    return RoleList, RoleSet, PackageList, PackageSet, win_tenderer_set, tenderee_or_agency_set, main_body_pack
def getPackageScopePattern():
    '''
    @summary: Build the regex source for package-scope keywords.

    The keywords are read from the "list_word" column of "end.xls", which is
    expected next to this module.  In each keyword the characters
    ( ) . [ ] - are backslash-escaped; any other character is inserted
    verbatim.

    @return: regex source string (not compiled)
    '''
    df = pd.read_excel(os.path.join(os.path.dirname(__file__), "end.xls"))
    escape_table = str.maketrans({
        "(": r"\(",
        ")": r"\)",
        ".": r"\.",
        "[": r"\[",
        "]": r"\]",
        "-": r"\-",
    })
    words = [str(item).translate(escape_table) for item in df["list_word"]]
    fixed_part = "业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}|##attachment##"
    if not words:
        # Without keywords the first alternative is left out; an empty group
        # followed by a colon would otherwise match every bare colon.
        return fixed_part
    return "(" + "|".join(words) + ")[::是为]|" + fixed_part


# Evaluated once at import time; used when scanning sentences for scope marks.
pattern_packageScope = getPackageScopePattern()
def getPackagesFromArticle(list_sentence, list_entity):
    '''
    @summary: Find the packages (bid sections) mentioned in the article and
              insert one "package" entity per accepted mention into
              list_entity.
    @param:
        list_sentence: sentences of the article; sorted in place by
                       sentence_index
        list_entity: entities of the article; package entities are appended
    @return: None when list_sentence is empty, otherwise the tuple
             (PackageList, PackageSet, dict_packageCode, main_body_pack):
             PackageList      list of dicts describing each package mention,
                              including its "scope", "hit" set and "pointer"
                              (the created package entity)
             PackageSet       set of all package names found
             dict_packageCode package name -> code found near the mention
             main_body_pack   package names found outside attachments
    '''
    if len(list_sentence) == 0:
        return None
    list_sentence.sort(key=lambda x: x.sentence_index)
    PackageSet = set()
    dict_packageCode = dict()
    main_body_pack = set()  # 2024/04/28: packages mentioned in the main body
    package_code_pattern = re.compile(r"(?:编号[::]?\s*)([-\dA-Za-z\(\)]{1,20})")
    # NOTE: the disabled "multiple projects / multiple winners" fallback
    # (get_win_project) and its patterns were removed; it was switched off on
    # 2024-05-08 because it duplicated the table extraction.

    def changeIndexFromWordToWords(tokens, word_index):
        '''
        @summary: Convert a character offset into the index of the token
                  containing it; returns None when the offset lies beyond the
                  tokens.
        '''
        before_index = 0
        after_index = 0
        for i, token in enumerate(tokens):
            after_index = after_index + len(token)
            if before_index <= word_index <= after_index:
                return i
            before_index = after_index
        return None

    def extractPackageCode(tokens, word_index, size=20, pattern=package_code_pattern):
        '''
        @summary: Extract the code that is closest to a package mention.
        @param:
            tokens: tokens of the sentence containing the package
            word_index: character offset of the package
            size: number of tokens taken on each side of the package
            pattern: regex whose group(1) is the code
        @return: the code string, or None when nothing is found
        '''
        index = changeIndexFromWordToWords(tokens, word_index)
        if index is None:
            return None
        # Clip the window at both sentence borders.
        begin = max(0, index - size)
        end = min(len(tokens), index + size)
        text = "".join(tokens[begin:end])
        # Offset of the package inside the window text.
        new_word_index = word_index - len("".join(tokens[:begin]))
        min_distance = len(text)
        packageCode = None
        for code_match in re.finditer(pattern, text):
            span_begin, span_end = code_match.span()
            distance = min(abs(new_word_index - span_begin), abs(new_word_index - span_end))
            if distance < min_distance:
                min_distance = distance
                packageCode = code_match.group(1)
        return packageCode

    def get_package():
        '''
        @summary: Collect, sentence by sentence, the package mentions and the
                  scope keywords (entries with an empty name), ordered by
                  character offset inside each sentence.
        '''
        scope_items = []
        for sentence in list_sentence:
            content = sentence.sentence_text
            tokens = sentence.tokens
            named_items = []
            for pack_match in find_package(content):
                temp_package_number = uniform_package_name(pack_match.group(0))
                match_begin, match_end = pack_match.span()
                named_items.append({"name": temp_package_number,
                                    "sentence_index": sentence.sentence_index,
                                    "offsetWords_begin": changeIndexFromWordToWords(tokens, match_begin),
                                    "offsetWord_begin": match_begin,
                                    "offsetWord_end": match_end})
                code = extractPackageCode(tokens, match_begin)
                if code is not None:
                    dict_packageCode[temp_package_number] = code
                PackageSet.add(temp_package_number)
                if not sentence.in_attachment:
                    main_body_pack.add(temp_package_number)
            # Scope keywords are recorded as nameless entries.
            keyword_items = []
            for scope_match in re.finditer(pattern_packageScope, content):
                match_begin, match_end = scope_match.span()
                keyword_items.append({"name": "",
                                      "sentence_index": sentence.sentence_index,
                                      "offsetWords_begin": changeIndexFromWordToWords(tokens, match_begin),
                                      "offsetWord_begin": match_begin,
                                      "offsetWord_end": match_end})
            # Stable sort: at equal offsets a package stays ahead of a keyword.
            sentence_items = named_items + keyword_items
            sentence_items.sort(key=lambda x: x["offsetWord_begin"])
            scope_items.extend(sentence_items)
        return scope_items

    def get_package_scope(PackageList_scope):
        '''
        @summary: Turn every named entry into a package entity and compute
                  the [begin, end] scope it governs.
        '''
        PackageList = []
        pattern_punctuation = r"[::()\(\),,。;;]"
        last_sentence = list_sentence[-1]
        article_end = [last_sentence.sentence_index,
                       changeIndexFromWordToWords(last_sentence.tokens,
                                                  len(last_sentence.sentence_text))]
        last_position = len(PackageList_scope) - 1
        # Entries are already ordered by sentence, so one pass is enough.
        for j, item in enumerate(PackageList_scope):
            i = item["sentence_index"]
            # sentence_index is used as a position in list_sentence.
            if item["name"] == "" or not 0 <= i < len(list_sentence):
                continue
            sentence_text = list_sentence[i].sentence_text
            char_begin = item["offsetWord_begin"]
            # max(0, ...) keeps the left window valid for mentions within the
            # first 30 characters (a negative start would index from the end).
            left_str = sentence_text[max(0, char_begin - 30):char_begin + 1]
            right_str = sentence_text[char_begin:char_begin + 30]
            if left_str[-1:] == "同" and item["name"] == "一":
                continue
            if "划分" in right_str[:10]:
                continue
            if j > 0:
                previous = PackageList_scope[j - 1]
                # Skip a mention that lies inside the previous entry's span.
                if previous["sentence_index"] == item["sentence_index"] and \
                        previous["offsetWord_begin"] <= item["offsetWord_begin"] and \
                        previous["offsetWord_end"] >= item["offsetWord_end"]:
                    continue

            _left_find = re.findall(pattern_punctuation, left_str)
            _right_find = re.findall(pattern_punctuation, right_str)
            colon_adjacent = (len(_left_find) > 0 and _left_find[-1] in [":", ":"]) or \
                             (len(_right_find) > 0 and _right_find[0] in [":", ":"])
            if colon_adjacent:
                scope_begin = [item["sentence_index"], item["offsetWords_begin"]]
            else:
                # 2024/10/10: otherwise the scope starts at the beginning of
                # the sentence holding the package.
                scope_begin = [item["sentence_index"], 0]
            if j == last_position:
                scope_end = article_end
            else:
                following = PackageList_scope[j + 1]
                scope_end = [following["sentence_index"], following["offsetWords_begin"]]

            # NOTE(review): the begin offset appears twice in the id; kept
            # as-is because ids may be relied upon elsewhere - confirm.
            _pack_entity = Entity(doc_id=list_sentence[0].doc_id,
                                  entity_id="%s_%s_%s_%s" % (
                                      list_sentence[0].doc_id, i, char_begin, char_begin),
                                  entity_text=item["name"],
                                  entity_type="package",
                                  sentence_index=item["sentence_index"],
                                  begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,
                                                                         char_begin),
                                  end_index=changeIndexFromWordToWords(list_sentence[i].tokens,
                                                                       item["offsetWord_end"]),
                                  wordOffset_begin=char_begin,
                                  wordOffset_end=item["offsetWord_end"],
                                  in_attachment=list_sentence[i].in_attachment)
            list_entity.append(_pack_entity)
            copy_pack = copy.copy(item)
            copy_pack["scope"] = [scope_begin, scope_end]
            copy_pack["hit"] = set()
            copy_pack["pointer"] = _pack_entity
            PackageList.append(copy_pack)
        return PackageList

    PackageList = get_package_scope(get_package())
    return PackageList, PackageSet, dict_packageCode, main_body_pack
# Optimal one-to-one pairing of main roles and attributes.
def dispatch(match_list):
    """Pick the best one-to-one pairing between main roles and attributes.

    Each candidate match contributes ``value + 10000`` as edge weight, so
    pairing as many candidates as possible takes precedence over their
    individual values.  The assignment problem is solved with
    ``linear_sum_assignment`` on the negated weights; cells that are not
    backed by a candidate (weight 0) are dropped from the result.

    Args:
        match_list: iterable of objects with hashable ``main_role`` and
            ``attribute`` and a numeric ``value``.

    Returns:
        list of ``(main_role, attribute)`` tuples, ordered by the first
        appearance of the main role in ``match_list``.
    """
    if not match_list:
        return []
    # First-seen order keeps rows/columns, and therefore tie-breaking and the
    # output order, reproducible between runs.
    main_roles = list(dict.fromkeys(match.main_role for match in match_list))
    attributes = list(dict.fromkeys(match.attribute for match in match_list))
    role_position = {role: position for position, role in enumerate(main_roles)}
    attribute_position = {attribute: position for position, attribute in enumerate(attributes)}

    label = np.zeros(shape=(len(main_roles), len(attributes)))
    for match in match_list:
        label[role_position[match.main_role], attribute_position[match.attribute]] = match.value + 10000

    # linear_sum_assignment minimises, so the weights are negated.
    cost = -label
    rows, cols = linear_sum_assignment(cost)
    return [(main_roles[r], attributes[c]) for r, c in zip(rows, cols) if cost[r, c]]
from BiddingKG.dl.common.Utils import getUnifyMoney
from BiddingKG.dl.interface.modelFactory import Model_relation_extraction

# Relation-extraction model, instantiated once when this module is imported.
relationExtraction_model = Model_relation_extraction()
- def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,on_value = 0.5,on_value_person=0.5,sentence_len=4):
- '''
- @param:
- PackDict:文章包dict
- roleSet:文章所有角色的公司名称
- PackageList:文章的包信息
- PackageSet:文章所有包的名称
- list_entity:文章所有经过模型处理的实体
- on_value:金额模型的阈值
- on_value_person:联系人模型的阈值
- sentence_len:公司和属性间隔句子的最大长度
- @return:添加了属性信息的角色list
- '''
-
- #根据roleid添加金额到rolelist中
- def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
- for i in range(len(packDict[packageName]["roleList"])):
- if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
- if money_prob>packDict[packageName]["roleList"][i].money_prob:
- packDict[packageName]["roleList"][i].money = money
- packDict[packageName]["roleList"][i].money_prob = money_prob
- return packDict
-
- #根据实体名称添加金额到rolelist中
- def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
- for i in range(len(packDict[packageName]["roleList"])):
- if packDict[packageName]["roleList"][i].entity_text==entity:
- # if money_prob>packDict[packageName]["roleList"][i].money_prob:
- # packDict[packageName]["roleList"][i].money = money
- # packDict[packageName]["roleList"][i].money_prob = money_prob
- if packDict[packageName]["roleList"][i].money_prob==0 : # 2021/7/20第一次更新金额
- if money.notes == '单价':
- packDict[packageName]["roleList"][i].unit_price = money.entity_text
- else:
- packDict[packageName]["roleList"][i].money = money.entity_text
- packDict[packageName]["roleList"][i].money_prob = money_prob
- packDict[packageName]["roleList"][i].money_unit = money.money_unit
- elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or (money.notes in ['大写'] and money.in_attachment==False): # 2021/7/20改为优先选择大写金额,
- # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob)
- # print('链接金额备注 ',money.notes, money.entity_text, money.values)
- if money.notes == '单价':
- packDict[packageName]["roleList"][i].unit_price = money.entity_text
- else:
- packDict[packageName]["roleList"][i].money = money.entity_text
- packDict[packageName]["roleList"][i].money_prob = money_prob
- packDict[packageName]["roleList"][i].money_unit = money.money_unit
- # print('链接中的金额:{0}, 单位:{1}'.format(money.entity_text, money.money_unit))
- return packDict
- def addRatioByEntity(packDict,packageName,entity,ratio):
- for i in range(len(packDict[packageName]["roleList"])):
- if packDict[packageName]["roleList"][i].entity_text==entity:
- packDict[packageName]["roleList"][i].ratio = ratio.ratio_value
- def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
- for i in range(len(packDict[packageName]["roleList"])):
- if packDict[packageName]["roleList"][i].entity_text==entity and not packDict[packageName]["roleList"][i].serviceTime:
- # packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text
- packDict[packageName]["roleList"][i].serviceTime = extract_serviceTime(serviceTime.entity_text,"")
- #根据实体名称得到角色
- def getRoleWithText(packDict,entity_text):
- for pack in packDict.keys():
- for i in range(len(packDict[pack]["roleList"])):
- if packDict[pack]["roleList"][i].entity_text==entity_text:
- return packDict[pack]["roleList"][i].role_name
-
- def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
- _list_entitys = [entity]+entity.linked_entitys
- for _entity in _list_entitys:
- if _entity.entity_text in RoleSet:
- return True
-
- p_entity = 0
- # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000
- # money_list = [it for it in list_entity if it.entity_type=="money"]
- # for i in range(len(money_list)-1):
- # for j in range(1, len(money_list)):
- # if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
- # Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
- # money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
- # # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
- '''同样金额同时有元及万元单位的,把万元的金额改为元'''
- wanyuan = []
- yuan = []
- for it in list_entity:
- if it.entity_type == "money" and float(it.entity_text)>1000000: # 20240523 修改为百万以上金额才对比万倍关系,其他又行业限额纠正避免有些万元单位提取不到从而被除一万 例:52435607 最高限价(万元):22679.32 蜀冈招标控制价22679.32工程地点南路西侧(万元)
- if it.money_unit == '万元' or float(it.entity_text)>5000000000:
- wanyuan.append(it)
- if it.money_unit == '元' or float(it.entity_text)<5000000:
- yuan.append(it)
- if wanyuan != [] and yuan != []:
- for m1 in wanyuan:
- for m2 in yuan:
- if Decimal(m1.entity_text)/Decimal(m2.entity_text) == 10000:
- m1.entity_text = m2.entity_text
- #遍历所有实体
- # while(p_entity<len(list_entity)):
- # entity = list_entity[p_entity]
- '''
- #招标金额从后往前找
- if entity.entity_type=="money":
- if entity.values[entity.label]>=on_value:
- if str(entity.label)=="0":
- packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
- if packagePointer is None:
- packageName = "Project"
- else:
- packageName = packagePointer.entity_text
- addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
- '''
- ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
- if entity.entity_type=="person":
- if entity.values[entity.label]>=on_value_person:
- if str(entity.label)=="1":
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].role_name=="tenderee":
- PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
- # add pointer_person
- for _entity in list_entity:
- if dict_role_id.get(str(_entity.label))=="tenderee":
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
- _entity.pointer_person = entity
- elif str(entity.label)=="2":
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].role_name=="agency":
- PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
- # add pointer_person
- for _entity in list_entity:
- if dict_role_id.get(str(_entity.label))=="agency":
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
- _entity.pointer_person = entity
- '''
- # #金额往前找实体
- # if entity.entity_type=="money":
- # if entity.values[entity.label]>=on_value:
- # p_entity_money= p_entity
- # entity_money = list_entity[p_entity_money]
- # if len(PackageSet)>0:
- # packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
- # if packagePointer is None:
- # packageName_entity = "Project"
- # else:
- # packageName_entity = packagePointer.entity_text
- # else:
- # packageName_entity = "Project"
- # while(p_entity_money>0):
- # entity_before = list_entity[p_entity_money]
- # if entity_before.entity_type in ['org','company']:
- # if str(entity_before.label)=="1":
- # addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
- # #add pointer_money
- # entity_before.pointer_money = entity_money
- # break
- # p_entity_money -= 1
- #如果实体属于角色集合,则往后找属性
- # if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
- #
- # p_entity += 1
- # #循环查找符合的属性
- # while(p_entity<len(list_entity)):
- #
- # entity_after = list_entity[p_entity]
- # if entity_after.sentence_index-entity.sentence_index>=sentence_len:
- # p_entity -= 1
- # break
- # #若是遇到公司实体,则跳出循环
- # if entity_after.entity_type in ['org','company']:
- # p_entity -= 1
- # break
- # if entity_after.values is not None:
- # if entity_after.entity_type=="money":
- # if entity_after.values[entity_after.label]>=on_value:
- # '''
- # #招标金额从后往前找
- # if str(entity_after.label)=="0":
- # packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
- # if packagePointer is None:
- # packageName = "Project"
- # else:
- # packageName = packagePointer.entity_text
- # addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
- # '''
- # if str(entity_after.label)=="1":
- # #print(entity_after.entity_text,entity.entity_text)
- # _list_entitys = [entity]+entity.linked_entitys
- # if len(PackageSet)>0:
- # packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
- # if packagePointer is None:
- # packageName_entity = "Project"
- # else:
- # packageName_entity = packagePointer.entity_text
- # else:
- # packageName_entity = "Project"
- # if str(entity.label) in ["2","3","4"]:
- # # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
- # if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
- # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
- # 0.5)
- # entity.pointer_money = entity_after
- # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
- # else:
- # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
- # entity_after.values[entity_after.label])
- # entity.pointer_money = entity_after
- # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
- # if entity_after.values[entity_after.label]>0.6:
- # break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
- # #add pointer_money
- # # entity.pointer_money = entity_after
- # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
- # # if entity_after.notes!='单价':
- # # break # 2021/7/16 新增,找到中标金额即停止,不再往后找金额
- # '''
- # if entity_after.entity_type=="person":
- # if entity_after.values[entity_after.label]>=on_value_person:
- # if str(entity_after.label)=="1":
- # for i in range(len(roleList)):
- # if roleList[i].role_name=="tenderee":
- # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- # elif str(entity_after.label)=="2":
- # for i in range(len(roleList)):
- # if roleList[i].role_name=="agency":
- # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- # elif str(entity_after.label)=="3":
- # _list_entitys = [entity]+entity.linked_entitys
- # for _entity in _list_entitys:
- # for i in range(len(roleList)):
- # if roleList[i].entity_text==_entity.entity_text:
- # if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
- # break
- # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- # '''
- #
- # p_entity += 1
- #
- # p_entity += 1
- # 记录每句的分词数量
- tokens_num_dict = dict()
- last_tokens_num = 0
- for sentence in list_sentence:
- _index = sentence.sentence_index
- if _index == 0:
- tokens_num_dict[_index] = 0
- else:
- tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
- last_tokens_num = len(sentence.tokens)
- attribute_type = ['money','serviceTime','ratio']# 'money'仅指“中投标金额”
- for link_attribute in attribute_type:
- temp_entity_list = []
- if link_attribute=="money":
- temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
- (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)]
- # 删除重复的‘中投标金额’,一般为大小写两种样式
- drop_tendererMoney = []
- for ent_idx in range(len(temp_entity_list)-1):
- entity = temp_entity_list[ent_idx]
- if entity.entity_type=='money':
- next_entity = temp_entity_list[ent_idx+1]
- if next_entity.entity_type=='money':
- if getUnifyMoney(entity.entity_text)==getUnifyMoney(next_entity.entity_text):
- if (tokens_num_dict[next_entity.sentence_index] + next_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index) < 10:
- drop_tendererMoney.append(next_entity)
- for _drop in drop_tendererMoney:
- temp_entity_list.remove(_drop)
- elif link_attribute=="serviceTime":
- temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
- ent.entity_type=='serviceTime']
- elif link_attribute=="ratio":
- temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
- ent.entity_type=='ratio']
- temp_entity_list = sorted(temp_entity_list,key=lambda x: (x.sentence_index, x.begin_index))
- temp_match_list = []
- for ent_idx in range(len(temp_entity_list)):
- entity = temp_entity_list[ent_idx]
- if entity.entity_type in ['org','company']:
- match_nums = 0
- tenderer_nums = 0 #经过其他中投标人的数量
- byNotTenderer_match_nums = 0 #跟在中投标人后面的属性
- for after_index in range(ent_idx + 1, min(len(temp_entity_list), ent_idx + 4)):
- after_entity = temp_entity_list[after_index]
- if entity.in_attachment != after_entity.in_attachment: # 正文与附件的不能相连
- break
- if after_entity.entity_type == link_attribute:
- distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- sentence_distance = after_entity.sentence_index - entity.sentence_index
- value = (-1 / 2 * (distance ** 2)) / 10000
- if link_attribute == "money":
- if after_entity.notes == '单价':
- value = value * 100
- if sentence_distance == 0:
- if distance < 100:
- # value = (-1 / 2 * (distance ** 2)) / 10000
- temp_match_list.append(Match(entity, after_entity, value))
- match_nums += 1
- if not tenderer_nums:
- byNotTenderer_match_nums += 1
- else:
- break
- else:
- if distance < 60:
- # value = (-1 / 2 * (distance ** 2)) / 10000
- temp_match_list.append(Match(entity, after_entity, value))
- match_nums += 1
- if not tenderer_nums:
- byNotTenderer_match_nums += 1
- else:
- break
- else:
- tenderer_nums += 1
- #前向查找属性
- if ent_idx!=0 and (not match_nums or not byNotTenderer_match_nums):
- previous_entity = temp_entity_list[ent_idx - 1]
- if previous_entity.entity_type == link_attribute:
- # if previous_entity.sentence_index == entity.sentence_index:
- distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
- tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
- if distance < 40:
- # 前向 没有 /10000
- value = (-1 / 2 * (distance ** 2))
- temp_match_list.append(Match(entity, previous_entity, value))
- # km算法分配求解
- dispatch_result = dispatch(temp_match_list)
- dispatch_result = sorted(dispatch_result, key=lambda x: (x[0].sentence_index,x[0].begin_index))
- for match in dispatch_result:
- _entity = match[0]
- _attribute = match[1]
- if link_attribute=='money':
- _entity.pointer_money = _attribute
- packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
- "money-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
- # print(_entity.entity_text,_attribute.entity_text)
- if packagePointer is None:
- packageName_entity = "Project"
- else:
- packageName_entity = packagePointer.entity_text
- if _attribute.notes == '单价' or float(_attribute.entity_text) < 5000: # 2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
- # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label])
- addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,0.5)
- else:
- # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label])
- addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,
- _attribute.values[_attribute.label])
- elif link_attribute=='serviceTime':
- _entity.pointer_serviceTime = _attribute
- packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
- "serviceTime-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
- if packagePointer is None:
- packageName_entity = "Project"
- else:
- packageName_entity = packagePointer.entity_text
- addServiceTimeByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
- elif link_attribute=='ratio':
- _entity.pointer_ratio = _attribute
- packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
- "ratio-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
- if packagePointer is None:
- packageName_entity = "Project"
- else:
- packageName_entity = packagePointer.entity_text
- addRatioByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
- ''''''
- # 通过模型分类的招标/代理联系人
- list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
- person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]]
- tenderee_contact = set()
- tenderee_phone = set()
- agency_contact = set()
- agency_phone = set()
- winter_contact = set()
- for _person in person_list:
- if _person.label == 1:
- tenderee_contact.add(_person.entity_text)
- if _person.label == 2:
- agency_contact.add(_person.entity_text)
- # 正则匹配无 '主体/联系人' 的电话
- # 例:"采购人联系方式:0833-5226788,"
- phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
- '\+86.?1[3-9]\d{9}|' \
- '0[1-9]\d{1,2}[-—-―][2-9]\d{6,7}/[1-9]\d{6,10}|' \
- '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?.?转\d{1,4}|' \
- '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|' \
- '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=1[3-9]\d{9})|' \
- '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|' \
- '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=[2-9]\d{6,7})|' \
- '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?|' \
- '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|' \
- '[2-9]\d{6,7})'
- re_tenderee_phone = re.compile(
- "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
- # 电话号码
- + phone_pattern)
- # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
- re_tenderee_phone2 = re.compile(
- "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
- # 电话号码
- + phone_pattern)
- re_agent_phone = re.compile(
- "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
- # 电话号码
- + phone_pattern)
- re_agent_phone2 = re.compile(
- "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
- # 电话号码
- + phone_pattern)
- content = ""
- for _sentence in list_sentence:
- content += "".join(_sentence.tokens)
- _content = copy.deepcopy(content)
- while re.search("(.)(,)([^0-9])|([^0-9])(,)(.)", content):
- content_words = list(content)
- for i in re.finditer("(.)(,)([^0-9])", content):
- content_words[i.span(2)[0]] = ""
- for i in re.finditer("([^0-9])(,)(.)", content):
- content_words[i.span(2)[0]] = ""
- content = "".join(content_words)
- content = re.sub("[::]|[\((]|[\))]", "", content)
- _tenderee_phone = re.findall(re_tenderee_phone, content)
- # 更新正则确定的角色属性
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].role_name == "tenderee":
- _tenderee_phone = re.findall(re_tenderee_phone, content)
- if _tenderee_phone:
- for _phone in _tenderee_phone:
- _phone = _phone.split("/") # 分割多个号码
- for one_phone in _phone:
- PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
- tenderee_phone.add(one_phone)
- _tenderee_phone2 = re.findall(re_tenderee_phone2, content)
- if _tenderee_phone2:
- for _phone in _tenderee_phone2:
- _phone = _phone.split("/")
- for one_phone in _phone:
- PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
- tenderee_phone.add(one_phone)
- if PackDict["Project"]["roleList"][i].role_name == "agency":
- _agent_phone = re.findall(re_agent_phone, content)
- if _agent_phone:
- for _phone in _agent_phone:
- _phone = _phone.split("/")
- for one_phone in _phone:
- PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
- agency_phone.add(one_phone)
- _agent_phone2 = re.findall(re_agent_phone2, content)
- if _agent_phone2:
- for _phone in _agent_phone2:
- _phone = _phone.split("/")
- for one_phone in _phone:
- PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
- agency_phone.add(one_phone)
- # 正则提取电话号码实体
- # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
- phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
- '\+86.?1[3-9]\d{9}|'
- # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
- '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
- '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
- '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
- '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
- '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
- '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
- '400\d{7}转\d{1,4}|'
- '[2-9]\d{6,7}')
- url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[#$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
- email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
- "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
- phone_entitys = []
- code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
- for _sentence in list_sentence:
- sentence_text = _sentence.sentence_text
- # 过长数字串直接过滤替换
- for _re in re.findall("\d{50,}",sentence_text):
- sentence_text = sentence_text.replace(_re,"#"*len(_re))
- in_attachment = _sentence.in_attachment
- list_tokenbegin = []
- begin = 0
- for i in range(0, len(_sentence.tokens)):
- list_tokenbegin.append(begin)
- begin += len(str(_sentence.tokens[i]))
- list_tokenbegin.append(begin + 1)
- # 排除网址、邮箱、项目编号实体
- error_list = []
- for i in re.finditer(url_pattern, sentence_text):
- error_list.append((i.start(), i.end()))
- for i in re.finditer(email_pattern, sentence_text):
- error_list.append((i.start(), i.end()))
- for code_ent in [ent for ent in code_entitys if ent.sentence_index==_sentence.sentence_index]:
- error_list.append((code_ent.wordOffset_begin,code_ent.wordOffset_end))
- res_set = set()
- for i in re.finditer(phone, sentence_text):
- is_continue = False
- for error_ent in error_list:
- if i.start()>=error_ent[0] and i.end()<=error_ent[1]:
- is_continue = True
- break
- if is_continue:
- continue
- res_set.add((i.group(), i.start(), i.end()))
- res_set = sorted(list(res_set),key=lambda x:x[1])
- # 限制数量,防止异常数据处理时间过长
- res_set = res_set[:200]
- last_phone_mask = True
- error_numStr_index = []
- sentence_phone_list = []
- for item_idx in range(len(res_set)):
- item = res_set[item_idx]
- phone_left = sentence_text[max(0, item[1] - 10):item[1]]
- phone_right = sentence_text[item[2]:item[2] + 10]
- phone_left_num = re.search("[\da-zA-Z\-—-―]+$",phone_left)
- numStr_left = item[1]
- if phone_left_num:
- numStr_left -= len(phone_left_num.group())
- phone_right_num = re.search("^[\da-zA-Z\-—-―]+",phone_right)
- numStr_right = item[2]
- if phone_right_num:
- numStr_right += len(phone_right_num.group())
- numStr_index = (numStr_left,numStr_right)
- if re.search("电话|手机|联系[人方]|联系方式",re.sub(",","",phone_left)):
- pass
- else:
- # 排除“传真号”和其它错误项
- if re.search("传,?真|信,?箱|邮,?[编箱件]|QQ|qq", phone_left):
- if not re.search("电,?话", phone_left):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|费率|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- # 号码含有0过多,不符合规则
- if re.search("0{6,}",item[0]):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- # 前后跟着字母
- if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- # 时间日期类排除
- if re.search("时间|日期", phone_left):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- # 排除号码实体为时间格式 ,例如:20150515
- if re.search("^20(1[0-9]|2[0-5])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- # 前后跟着长度小于一定值数字的正则排除
- if re.search("\d+[-—-―]?\d*$",phone_left) or re.search("^\d+[-—-―]?\d*",phone_right):
- phone_left_number = re.search("\d+[-—-―]?\d*$",phone_left)
- phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right)
- if phone_left_number:
- if len(phone_left_number.group())<7:
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- if phone_right_number:
- if len(phone_right_number.group())<7:
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- left_context = re.search("[\da-zA-Z\-—-―]+$",sentence_text[:item[1]])
- if left_context:
- if len(left_context.group()) != len("".join(re.findall(phone, left_context.group()))):
- # if not re.search("(" + phone.pattern + ")$", left_context.group()):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- right_context = re.search("^[\da-zA-Z\-—-―]+", sentence_text[item[2]:])
- if right_context:
- if len(right_context.group()) != len("".join(re.findall(phone, right_context.group()))):
- # if not re.search("^(" + phone.pattern + ")", right_context.group()):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- # if:上一个phone实体不符合条件
- if not last_phone_mask:
- item_start = item[1]
- last_item_end = res_set[item_idx-1][2]
- if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―、]+$",sentence_text[last_item_end:item_start]):
- error_numStr_index.append(numStr_index)
- last_phone_mask = False
- continue
- sentence_phone_list.append(item)
- last_phone_mask = True
- if error_numStr_index:
- drop_list = []
- for item in sentence_phone_list:
- for err_index in error_numStr_index:
- if (item[1]>=err_index[0] and item[1]<=err_index[1]) or (item[2]>=err_index[0] and item[2]<=err_index[1]) or (item[1]<=err_index[0] and item[2]>=err_index[1]):
- drop_list.append(item)
- break
- for _drop_item in drop_list:
- sentence_phone_list.remove(_drop_item)
- for item in sentence_phone_list:
- for j in range(len(list_tokenbegin)):
- if list_tokenbegin[j] == item[1]:
- begin_index = j
- break
- elif list_tokenbegin[j] > item[1]:
- begin_index = j - 1
- break
- for j in range(begin_index, len(list_tokenbegin)):
- if list_tokenbegin[j] >= item[2]:
- end_index = j - 1
- break
- phone_text = re.sub("[-—-―]+","-",item[0]).replace("(","(").replace(")",")")
- _entity = Entity(_sentence.doc_id, None, phone_text, "phone", _sentence.sentence_index, begin_index, end_index, item[1],
- item[2],in_attachment=in_attachment)
- phone_entitys.append(_entity)
- # print('phone_set:',set([ent.entity_text for ent in phone_entitys]))
def is_company(entity, text):
    """Tell whether a company/org-like entity is a genuine organisation mention.

    The check is used to filter out "company" entities that are really part
    of an address/location or bank line.

    Args:
        entity: object exposing ``label``, ``values`` (indexable by ``label``),
            ``is_tail`` and ``wordOffset_begin``.
        text: text of the sentence the entity belongs to; ``wordOffset_begin``
            is used as a character offset into it.

    Returns:
        ``True`` when the entity should be kept as a company, ``False`` when
        it looks like an address/location/bank mention or is flagged as tail.
    """
    # A confidently classified entity (any label other than 5 with score > 0.5)
    # is accepted without looking at context.
    # presumably label 5 is the "no role" class -- verify against the role model.
    if entity.label != 5 and entity.values[entity.label] > 0.5:
        return True
    # Use the argument itself: the previous code read the enclosing scope's
    # loop variable ``ent`` here, which only worked by accident of the caller's
    # variable naming.
    if entity.is_tail == True:  # noqa: E712 - keep the original equality semantics
        return False
    # Inspect at most the 10 characters immediately left of the entity.
    left_start = max(0, entity.wordOffset_begin - 10)
    entity_left = text[left_start:entity.wordOffset_begin]
    # NOTE(review): this pattern matches the literal sequence ",()" followed by
    # "()" rather than any single comma/parenthesis; a character class may have
    # been intended -- confirm before changing, since it alters the filtering.
    entity_left = re.sub(r",()\(\)", "", entity_left)
    # Only the 5 characters closest to the entity are decisive.
    entity_left = entity_left[-5:]
    # An address / place / bank cue right before the entity means it is not a
    # real company mention.
    return re.search("地址|地点|银行[::]", entity_left) is None
- pre_entity = []
- for ent in list_entity:
- if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \
- or (ent.entity_type=='location' and len(ent.entity_text)>5):
- pre_entity.append(ent)
- text_data,pre_data = relationExtraction_model.encode(pre_entity + phone_entitys, list_sentence)
- # print(pre_data)
- maxlen = 512
- relation_list = []
- if 0<len(text_data)<=maxlen:
- relation_list = relationExtraction_model.predict(text_data, pre_data)
- else:
- # 公告大于maxlen时,分段预测
- start = 0
- # print("len(pre_data)",len(pre_data))
- temp_data = []
- deal_data = 0
- while start<len(pre_data):
- _pre_data = pre_data[start:start+maxlen]
- _text_data = text_data[start:start+maxlen]
- if relationExtraction_model.check_data(_pre_data):
- temp_data.append((_text_data,_pre_data))
- else:
- if temp_data:
- deal_data += len(temp_data)
- if deal_data>4:
- break
- for _text_data, _pre_data in temp_data:
- relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
- temp_data = []
- start = start + maxlen - 120
- if temp_data:
- deal_data += len(temp_data)
- if deal_data <= 4:
- for _text_data, _pre_data in temp_data:
- relation_list.extend(relationExtraction_model.predict(_text_data, _pre_data))
- # print("预测数据:",len(temp_data))
- # 去重结果
- relation_list = list(set(relation_list))
- # print([(rel[0].entity_text,rel[2].entity_text) for rel in relation_list])
- right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
- linked_company = set()
- linked_person = set()
- linked_connetPerson = set()
- linked_phone = set()
- for predicate in ["rel_address","rel_phone","rel_person"]:
- _match_list = []
- _match_combo = []
- for relation in relation_list:
- _subject = relation[0]
- _object = relation[2]
- if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
- if _subject.in_attachment != _object.in_attachment:
- continue
- if relation[1]==predicate:
- distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
- tokens_num_dict[_subject.sentence_index] + _subject.end_index)
- if predicate=="rel_person":
- if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
- continue
- # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
- if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
- continue
- # 角色为招标/代理人,排除"纪检|监察"相关的联系人
- if _subject.label in [0,1] and re.search("纪检|监察",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
- continue
- if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
- if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系",
- list_sentence[_object.sentence_index-1].sentence_text[-10:]+
- list_sentence[_object.sentence_index].sentence_text[0:_object.wordOffset_begin]):
- continue
- # 角色为中标候选人,排除距离过远的联系人
- if _subject.label in [2, 3, 4] and distance>=40:
- continue
- if distance>0:
- value = (-1 / 2 * (distance ** 2))/10000
- else:
- distance = abs(distance)
- value = (-1 / 2 * (distance ** 2))
- _match_list.append(Match(_subject,_object,value))
- _match_combo.append((_subject,_object))
- match_result = dispatch(_match_list)
- error_list = []
- for mat in list(set(_match_combo)-set(match_result)):
- for temp in match_result:
- if mat[1]==temp[1] and mat[0]!=temp[0]:
- error_list.append(mat)
- break
- result = list(set(_match_combo)-set(error_list))
- if predicate=='rel_person':
- # 从后往前更新状态,已近后向链接的属性不在前向链接(解决错误链接)
- result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
- for combo in result:
- is_continue = False
- if not combo[0].pointer_person:
- combo[0].pointer_person = []
- if combo[1].begin_index<combo[0].begin_index:
- if combo[0].pointer_person:
- for temp in combo[0].pointer_person:
- if temp.begin_index>combo[0].begin_index:
- is_continue = True
- break
- if is_continue:
- continue
- combo[0].pointer_person.append(combo[1])
- linked_company.add(combo[0])
- linked_person.add(combo[1])
- # print(1,combo[0].entity_text,combo[1].entity_text)
- if predicate=='rel_address':
- result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
- for combo in result:
- if combo[0].pointer_address:
- continue
- combo[0].pointer_address = combo[1]
- # print(2,combo[0].entity_text,combo[1].entity_text)
- if predicate=='rel_phone':
- result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
- for combo in result:
- is_continue = False
- if not combo[0].person_phone:
- combo[0].person_phone = []
- if combo[1].begin_index<combo[0].begin_index:
- if combo[0].person_phone:
- for temp in combo[0].person_phone:
- if temp.begin_index>combo[0].begin_index:
- is_continue = True
- break
- if is_continue: continue
- combo[0].person_phone.append(combo[1])
- linked_connetPerson.add(combo[0])
- linked_phone.add(combo[1])
- if combo[0].label in [1,2]:
- if PackDict.get("Project"):
- for i in range(len(PackDict["Project"]["roleList"])):
- if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \
- or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'):
- PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
- break
- # print(3,combo[0].entity_text,combo[1].entity_text)
- # "公司——地址" 链接规则补充
- company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
- # company_lacation_EntityList = [ent for ent in pre_entity if (ent.entity_type in ['company', 'org'] and ent.label!=5) or ent.entity_type=="location"]
- company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
- t_match_list = []
- for ent_idx in range(len(company_lacation_EntityList)):
- entity = company_lacation_EntityList[ent_idx]
- if entity.entity_type in ['company', 'org'] and entity.label!=5:
- match_nums = 0
- company_nums = 0 # 经过其他公司的数量
- location_nums = 0 # 经过电话的数量
- for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
- after_entity = company_lacation_EntityList[after_index]
- if after_entity.entity_type == "location":
- distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- location_nums += 1
- if distance > 100 or location_nums >= 3:
- break
- sentence_distance = after_entity.sentence_index - entity.sentence_index
- value = (-1 / 2 * (distance ** 2)) / 10000
- if sentence_distance == 0:
- if distance < 80:
- t_match_list.append(Match(entity, after_entity, value))
- match_nums += 1
- if company_nums:
- break
- else:
- if distance < 50:
- t_match_list.append(Match(entity, after_entity, value))
- match_nums += 1
- if company_nums:
- break
- else:
- # type:company/org
- company_nums += 1
- if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
- break
- if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
- break
- if entity.label in [0, 1] and after_entity.label not in [0, 1]:
- break
- # km算法分配求解
- # for item in t_match_list:
- # print("loc_rela",item.main_role.entity_text,item.attribute.entity_text)
- relate_location_result = dispatch(t_match_list)
- relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
- for match in relate_location_result:
- _company = match[0]
- _relation = match[1]
- # print("loc_relation1", _company.entity_text, _relation.entity_text, )
- if not _company.pointer_address:
- # print('loc_relation2',_company.entity_text,_relation.entity_text)
- _company.pointer_address = _relation
- # "联系人——联系电话" 链接规则补充
- person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
- person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
- t_match_list = []
- for ent_idx in range(len(person_phone_EntityList)):
- entity = person_phone_EntityList[ent_idx]
- if entity.entity_type=="person":
- match_nums = 0
- person_nums = 0 # 经过其他中联系人的数量
- byNotPerson_match_nums = 0 # 跟在联系人后面的属性
- phone_nums = 0 # 经过电话的数量
- for after_index in range(ent_idx + 1, min(len(person_phone_EntityList), ent_idx + 8)):
- after_entity = person_phone_EntityList[after_index]
- if after_entity.entity_type == "phone":
- distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- phone_nums += 1
- if distance>100 or phone_nums>=4:
- break
- sentence_distance = after_entity.sentence_index - entity.sentence_index
- value = (-1 / 2 * (distance ** 2)) / 10000
- if sentence_distance == 0:
- if distance < 70:
- # value = (-1 / 2 * (distance ** 2)) / 10000
- t_match_list.append(Match(entity, after_entity, value))
- match_nums += 1
- if not person_nums:
- byNotPerson_match_nums += 1
- else:
- break
- else:
- if distance < 40:
- # value = (-1 / 2 * (distance ** 2)) / 10000
- t_match_list.append(Match(entity, after_entity, value))
- match_nums += 1
- if not person_nums:
- byNotPerson_match_nums += 1
- else:
- break
- else:
- person_nums += 1
- # 前向查找属性
- if ent_idx != 0 and (not match_nums or not byNotPerson_match_nums):
- previous_entity = person_phone_EntityList[ent_idx - 1]
- if previous_entity.entity_type == 'phone':
- # if previous_entity.sentence_index == entity.sentence_index:
- distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
- tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
- if distance < 40:
- # 前向 没有 /10000
- value = (-1 / 2 * (distance ** 2))
- t_match_list.append(Match(entity, previous_entity, value))
- # km算法分配求解(person-phone)
- t_match_list = [mat for mat in t_match_list if mat.main_role not in linked_connetPerson and mat.attribute not in linked_phone]
- personphone_result = dispatch(t_match_list)
- personphone_result = sorted(personphone_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
- for match in personphone_result:
- _person = match[0]
- _phone = match[1]
- if not _person.person_phone:
- _person.person_phone = []
- _person.person_phone.append(_phone)
- # 多个招标人/代理人或者别称
- for idx in range(1,len(pre_entity)):
- _pre_entity = pre_entity[idx]
- if _pre_entity in linked_company and _pre_entity.label==5:
- last_ent = pre_entity[idx-1]
- if last_ent.entity_type in ['company','org'] and last_ent.label in [0,1]:
- if last_ent.sentence_index==_pre_entity.sentence_index:
- mid_text = list_sentence[_pre_entity.sentence_index].sentence_text[last_ent.wordOffset_end:_pre_entity.wordOffset_begin]
- if len(mid_text)<=20 and "," not in mid_text and re.search("[、\((]",mid_text):
- _pre_entity.label = last_ent.label
- _pre_entity.values[last_ent.label] = 0.6
- # 2022/01/25 固定电话可连多个联系人
- temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person']
- temp_person_entitys2 = [] #和固定电话相连的联系人
- for entity in temp_person_entitys:
- if entity.person_phone:
- for _phone in entity.person_phone:
- if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
- temp_person_entitys2.append(entity)
- break
- for index in range(len(temp_person_entitys)):
- entity = temp_person_entitys[index]
- if entity in temp_person_entitys2:
- last_person = entity
- for after_index in range(index + 1, min(len(temp_person_entitys), index + 5)):
- after_entity = temp_person_entitys[after_index]
- if after_entity.sentence_index == last_person.sentence_index and after_entity.begin_index - last_person.end_index < 3:
- for _phone in entity.person_phone:
- if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
- if _phone not in after_entity.person_phone:
- after_entity.person_phone.append(_phone)
- last_person = after_entity
- else:
- break
- if index==0:
- continue
- last_person = entity
- for before_index in range(index-1, max(-1,index-5), -1):
- before_entity = temp_person_entitys[before_index]
- if before_entity.sentence_index == last_person.sentence_index and last_person.begin_index - before_entity.end_index < 3:
- for _phone in entity.person_phone:
- if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
- if _phone not in before_entity.person_phone:
- before_entity.person_phone.append(_phone)
- last_person = before_entity
- else:
- break
- # 更新person为招标/代理联系人的联系方式
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].role_name == "tenderee":
- for _person in person_list:
- if _person.label==1:#招标联系人
- person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
- for _p in person_phone:
- PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
- if not person_phone:
- PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
- if PackDict[k]["roleList"][i].role_name == "agency":
- for _person in person_list:
- if _person.label==2:#代理联系人
- person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
- for _p in person_phone:
- PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
- if not person_phone:
- PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
- # 更新 PackDict
- not_sure_linked = []
- for link_p in list(linked_company):
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].role_name == "tenderee":
- if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 0:
- not_sure_linked.append(link_p)
- continue
- if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in agency_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- continue
- for _p in person_phone:
- if per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- elif PackDict[k]["roleList"][i].role_name == "agency":
- if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 1:
- not_sure_linked.append(link_p)
- continue
- if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in tenderee_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- continue
- for _p in person_phone:
- if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- else:
- if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- winter_contact.add(per.entity_text)
- continue
- for _p in person_phone:
- if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \
- per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- winter_contact.add(per.entity_text)
- # 更新org/company实体label为0,1的链接
- for link_p in not_sure_linked:
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].role_name == "tenderee":
- if link_p.label == 0:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in agency_contact and per.entity_text not in winter_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- continue
- for _p in person_phone:
- if per.entity_text not in agency_contact and _p.entity_text not in agency_phone and per.entity_text not in winter_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- elif PackDict[k]["roleList"][i].role_name == "agency":
- if link_p.label == 1:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in tenderee_contact and per.entity_text not in winter_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- continue
- for _p in person_phone:
- if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
- split_list = [0] * 16
- split_dict = {
- "一、": 1,
- "二、": 2,
- "三、": 3,
- "四、": 4,
- "五、": 5,
- "六、": 6,
- "七、": 7,
- "八、": 8,
- "九、": 9,
- "十、": 10,
- "十一、": 11,
- "十二、": 12,
- "十三、": 13,
- "十四、": 14,
- "十五、": 15
- }
- for item in re.finditer(re_split, _content):
- _index = split_dict.get(item.group()[1:])
- if not split_list[_index]:
- split_list[_index] = item.span()[0] + 1
- split_list = [i for i in split_list if i != 0]
- start = 0
- new_split_list = []
- for idx in split_list:
- new_split_list.append((start, idx))
- start = idx
- new_split_list.append((start, len(_content)))
- # 实体列表按照“公告分段”分组
- words_num_dict = dict()
- last_words_num = 0
- for sentence in list_sentence:
- _index = sentence.sentence_index
- if _index == 0:
- words_num_dict[_index] = 0
- else:
- words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
- last_words_num = len(sentence.sentence_text)
- # 公司-联系人连接(km算法)
- re_phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
- '\+86.?1[3-9]\d{9}|'
- # '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
- '0[1-9]\d{1,2}[-—-―][2-9]\d{6,7}[^\d]?转\d{1,4}|'
- '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
- '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
- '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
- '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
- '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
- '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6,7}-?\d{,4}|'
- '400\d{7}转\d{1,4}|'
- '[2-9]\d{6,7}')
- key_phone = re.compile("联系方式|电话|联系人|负责人")
- temporary_list2 = []
- for entity in list_entity:
- # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False:
- if entity.entity_type in ['org', 'company', 'person']:
- temporary_list2.append(entity)
- temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index))
- new_temporary_list2 = []
- for _split in new_split_list:
- temp_list = []
- for _entity in temporary_list2:
- if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
- _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
- temp_list.append(_entity)
- elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
- break
- new_temporary_list2.append(temp_list)
- # print(new_temporary_list2)
- match_list2 = []
- for split_index in range(len(new_temporary_list2)):
- split_entitys = new_temporary_list2[split_index]
- if len(split_entitys)<=1:
- continue
- is_skip = False
- for index in range(len(split_entitys)):
- entity = split_entitys[index]
- if is_skip:
- is_skip = False
- continue
- else:
- if entity.entity_type in ['org', 'company']:
- if entity.label != 5 or entity.entity_text in roleSet:
- match_nums = 0
- for after_index in range(index + 1, min(len(split_entitys), index + 4)):
- after_entity = split_entitys[after_index]
- if entity.in_attachment != after_entity.in_attachment:
- break
- if after_entity.entity_type in ['person']:
- distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- # 实体为中标人/候选人,联系人已确定类别【1,2】
- if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
- break
- if entity.label in [2, 3, 4] and distance>=20:
- break
- # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
- if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
- break
- # 角色为招标/代理人,排除"纪检|监察"相关的联系人
- if entity.label in [0,1] and re.search("纪检|监察",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
- break
- if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
- if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",
- list_sentence[after_entity.sentence_index - 1].sentence_text[-10:] +
- list_sentence[after_entity.sentence_index].sentence_text[0:after_entity.wordOffset_begin]):
- continue
- if after_entity.label in [1, 2, 3]:
- # distance = (tokens_num_dict[
- # after_entity.sentence_index] + after_entity.begin_index) - (
- # tokens_num_dict[entity.sentence_index] + entity.end_index)
- sentence_distance = after_entity.sentence_index - entity.sentence_index
- if sentence_distance == 0:
- if distance < 100:
- if entity.label in [2, 3, 4] and distance>40:
- break
- if (entity.label == 0 and after_entity.label == 1) or (
- entity.label == 1 and after_entity.label == 2):
- distance = distance / 100
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, after_entity, value))
- match_nums += 1
- else:
- if distance < 60:
- if entity.label in [2, 3, 4] and distance>20:
- break
- if (entity.label == 0 and after_entity.label == 1) or (
- entity.label == 1 and after_entity.label == 2):
- distance = distance / 100
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, after_entity, value))
- match_nums += 1
- if after_entity.entity_type in ['org', 'company']:
- if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
- break
- # 解决在‘地址’中识别出org/company的问题
- # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]:
- if entity.label != 5 and after_index == index + 1 and (
- after_entity.label == entity.label or after_entity.label == 5):
- distance = (tokens_num_dict[
- after_entity.sentence_index] + after_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- if distance < 20:
- after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0,
- after_entity.begin_index - 10):after_entity.begin_index]
- after_entity_right = list_sentence[after_entity.sentence_index].tokens[
- after_entity.end_index + 1:after_entity.end_index + 6]
- after_entity_left = "".join(after_entity_left)
- if len(after_entity_left) > 20:
- after_entity_left = after_entity_left[-20:]
- after_entity_right = "".join(after_entity_right)[:10]
- if re.search("地,?址", after_entity_left):
- is_skip = True
- continue
- if re.search("\(|(", after_entity_left) and re.search("\)|)",after_entity_right):
- is_skip = True
- continue
- if entity.label in [0, 1] and after_entity.label in [0, 1] and entity.label == after_entity.label:
- break
- if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[
- index + 1].entity_type == "person":
- break
- if entity.label in [0, 1 ,5] and after_entity.label in [2, 3, 4]:
- break
- if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
- break
- # 搜索没有联系人的电话
- mid_tokens = []
- is_same_sentence = False
- if index == len(split_entitys) - 1:
- for i in range(entity.sentence_index, len(list_sentence)):
- mid_tokens += list_sentence[i].tokens
- mid_tokens = mid_tokens[entity.end_index + 1:]
- mid_sentence = "".join(mid_tokens)
- have_phone = re.findall(re_phone, mid_sentence)
- if have_phone:
- if re.findall(re_phone, mid_sentence.split("。")[0]):
- is_same_sentence = True
- _phone = have_phone[0]
- if _phone in [ent.entity_text for ent in phone_entitys]:
- phone_begin = mid_sentence.find(_phone)
- if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \
- new_split_list[split_index][1]:
- mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
- if re.search(key_phone, mid_sentence):
- if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
- pass
- else:
- distance = 1
- if is_same_sentence:
- if phone_begin <= 200:
- if entity.label in [2,3,4] and phone_begin>80:
- break
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, (entity, _phone), value))
- match_nums += 1
- else:
- if phone_begin <= 60:
- if entity.label in [2,3,4] and phone_begin>40:
- break
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, (entity, _phone), value))
- match_nums += 1
- else:
- next_entity = split_entitys[index + 1]
- if next_entity.entity_type in ["org","company"]:
- _entity_left = list_sentence[next_entity.sentence_index].sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
- _entity_left2 = re.sub(",()\(\)::", "", _entity_left)
- _entity_left2 = _entity_left2[-5:]
- if re.search("(地,?址|地,?点)[::][^,。]*$", _entity_left) or re.search("地址|地点", _entity_left2):
- if index + 2<= len(split_entitys) - 1:
- next_entity = split_entitys[index + 2]
- if len(_entity_left)<=2 and re.search("[、(\(]",_entity_left):
- if index + 2 <= len(split_entitys) - 1:
- next_entity = split_entitys[index + 2]
- if entity.sentence_index == next_entity.sentence_index:
- mid_tokens += list_sentence[entity.sentence_index].tokens[
- entity.end_index + 1:next_entity.begin_index]
- else:
- sentence_index = entity.sentence_index
- while sentence_index <= next_entity.sentence_index:
- mid_tokens += list_sentence[sentence_index].tokens
- sentence_index += 1
- mid_tokens = mid_tokens[entity.end_index + 1:-(len(
- list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1]
- mid_sentence = "".join(mid_tokens)
- have_phone = re.findall(re_phone, mid_sentence)
- if have_phone:
- if re.findall(re_phone, mid_sentence.split("。")[0]):
- is_same_sentence = True
- _phone = have_phone[0]
- if _phone in [ent.entity_text for ent in phone_entitys]:
- phone_begin = mid_sentence.find(_phone)
- mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
- if re.search(key_phone, mid_sentence):
- p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
- if next_entity.entity_type == 'person' and _phone in p_phone:
- pass
- elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
- pass
- else:
- distance = (tokens_num_dict[
- next_entity.sentence_index] + next_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- distance = distance / 2
- if is_same_sentence:
- if phone_begin <= 200:
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, (entity, _phone), value))
- match_nums += 1
- else:
- if phone_begin <= 60:
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, (entity, _phone), value))
- match_nums += 1
- # 实体无匹配时,尝试前向查找匹配
- if not match_nums:
- if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0:
- previous_entity = split_entitys[index - 1]
- if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
- if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
- continue
- if previous_entity.sentence_index == entity.sentence_index:
- distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
- tokens_num_dict[
- previous_entity.sentence_index] + previous_entity.end_index)
- if distance < 20:
- # 距离相等时,前向添加处罚值
- # distance += 1
- # 前向 没有 /10000
- value = (-1 / 2 * (distance ** 2))
- match_list2.append(Match(entity, previous_entity, value))
- # print(match_list2)
- # print([(mat.main_role.entity_text,mat.attribute.entity_text if not isinstance(mat.attribute, tuple) else mat.attribute[1]) for mat in match_list2])
- match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person]
- # print(match_list2)
- # print([(mat.main_role.entity_text,mat.attribute.entity_text if not isinstance(mat.attribute, tuple) else mat.attribute[1]) for mat in match_list2])
- # km算法分配求解
- result2 = dispatch(match_list2)
- # print(result2)
- for match in result2:
- entity = match[0]
- # print(entity.entity_text)
- # print(entity.label)
- # print(match.attribute)
- entity_index = list_entity.index(entity)
- is_update = False
- if isinstance(match[1], tuple):
- person_ = ''
- phone_ = match[1][1].split("/") # 分割多个号码
- # print(person_,phone_)
- else:
- person_ = match[1].entity_text
- phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].role_name == "tenderee":
- # if not PackDict[k]["roleList"][i].linklist:
- if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0:
- if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact:
- if not phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, ""))
- for p in phone_:
- # if not person_ and len()
- PackDict[k]["roleList"][i].linklist.append((person_, p))
- is_update = True
- elif PackDict[k]["roleList"][i].role_name == "agency":
- # if not PackDict[k]["roleList"][i].linklist:
- if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact:
- if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0:
- if not phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, ""))
- for p in phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, p))
- is_update = True
- else:
- if PackDict[k]["roleList"][i].entity_text == entity.entity_text:
- # if not PackDict[k]["roleList"][i].linklist:
- if len([item for item in PackDict[k]["roleList"][i].linklist if item[1]])==0: # 有联系人但无联系方式(号码)
- if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \
- person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0:
- if not phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, ""))
- for p in phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, p))
- is_update = True
- if not person_:
- is_update = False
- if is_update:
- # 更新 list_entity
- if not list_entity[entity_index].pointer_person:
- list_entity[entity_index].pointer_person = []
- list_entity[entity_index].pointer_person.append(match[1])
- # print('tenderee_contact',tenderee_contact)
- # print('tenderee_phone',tenderee_phone)
- # print('agency_contact',agency_contact)
- # print('agency_phone',agency_phone)
- # print('PackDict')
- # for k in PackDict.keys():
- # for i in range(len(PackDict[k]["roleList"])):
- # print(PackDict[k]["roleList"][i].role_name)
- # print(PackDict[k]["roleList"][i].entity_text)
- # print(PackDict[k]["roleList"][i].linklist)
- linked_person = []
- linked_persons_with = []
- for company_entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]:
- if company_entity.pointer_person:
- for _person in company_entity.pointer_person:
- linked_person.append(_person)
- linked_persons_with.append(company_entity)
- # 一个公司对应多个联系人的补充
- person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
- person_entitys = person_entitys[::-1]
- for index in range(len(person_entitys)):
- entity = person_entitys[index]
- prepare_link = []
- if entity not in linked_person:
- prepare_link.append(entity)
- last_person = entity
- for after_index in range(index + 1, min(len(person_entitys), index + 5)):
- after_entity = person_entitys[after_index]
- if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5:
- if after_entity in linked_person:
- _index = linked_person.index(after_entity)
- with_company = linked_persons_with[_index]
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].role_name == "tenderee":
- if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0:
- for item in prepare_link:
- person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
- for _p in person_phone:
- PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
- with_company.pointer_person.append(item)
- linked_person.append(item)
- elif PackDict["Project"]["roleList"][i].role_name == "agency":
- if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1:
- for item in prepare_link:
- person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
- for _p in person_phone:
- PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
- with_company.pointer_person.append(item)
- linked_person.append(item)
- else:
- if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text:
- for item in prepare_link:
- person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
- for _p in person_phone:
- PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
- with_company.pointer_person.append(item)
- linked_person.append(item)
- break
- else:
- prepare_link.append(after_entity)
- last_person = after_entity
- continue
- # 统一同类角色的属性
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- for _entity in list_entity:
- if _entity.entity_type in ['org','company']:
- is_same = False
- is_similar = False
- # entity_text相同
- if _entity.entity_text==PackDict[k]["roleList"][i].entity_text:
- is_same = True
- # entity.label为【0,1】
- if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict[k]["roleList"][i].role_name:
- is_similar = True
- if is_same:
- linked_entitys = _entity.linked_entitys
- if linked_entitys:
- for linked_entity in linked_entitys:
- pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else []
- for _pointer_person in pointer_person:
- _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
- for _p in _phone:
- if (_pointer_person.entity_text,_p) not in PackDict[k]["roleList"][i].linklist:
- PackDict[k]["roleList"][i].linklist.append((_pointer_person.entity_text,_p))
- elif is_similar:
- pointer_person = _entity.pointer_person if _entity.pointer_person else []
- for _pointer_person in pointer_person:
- _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
- for _p in _phone:
- if (_pointer_person.entity_text, _p) not in PackDict[k]["roleList"][i].linklist:
- PackDict[k]["roleList"][i].linklist.append(
- (_pointer_person.entity_text, _p))
- # "roleList"中联系人电话去重
- tenderee_agency_phone = []
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].role_name in ['agency','tenderee']:
- tenderee_agency_phone.extend([person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]])
- # 带有联系人的电话
- with_person = [person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[0]]
- # 带有电话的联系人
- with_phone = [person_phone[0] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]]
- remove_list = []
- for item in PackDict[k]["roleList"][i].linklist:
- if not item[0]:
- if item[1] in with_person:
- # 删除重复的无联系人电话
- remove_list.append(item)
- elif not item[1]:
- if item[0] in with_phone:
- remove_list.append(item)
- for _item in remove_list:
- PackDict[k]["roleList"][i].linklist.remove(_item)
- # 中标候选人联系方式异常排除
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].role_name in ['win_tenderer', 'second_tenderer','third_tenderer']:
- if tenderee_agency_phone:
- remove_list = []
- for item in PackDict[k]["roleList"][i].linklist:
- if item[1] and item[1] in tenderee_agency_phone:
- remove_list.append(item)
- for _item in remove_list:
- PackDict[k]["roleList"][i].linklist.remove(_item)
- # else:
- # # 公告中无招标代理联系方式时,可排除中标联系方式
- # remove_list = []
- # for _item in PackDict[k]["roleList"][i].linklist:
- # # 有联系方式
- # if _item[1]:
- # remove_list.append(_item)
- # for _item in remove_list:
- # PackDict[k]["roleList"][i].linklist.remove(_item)
- # PackDict更新company/org地址
- last_role_prob = {}
- for ent in pre_entity:
- if ent.entity_type in ['company','org']:
- if ent.pointer_address:
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].entity_text == ent.entity_text:
- if not PackDict[k]["roleList"][i].address:
- PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
- last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
- else:
- if PackDict[k]["roleList"][i].role_name in ['tenderee','agency']:
- # 角色为招标/代理人时,取其实体概率高的链接地址作为角色address
- if ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]] > last_role_prob[PackDict[k]["roleList"][i].role_name]:
- PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
- last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
- else:
- if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
- PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
- # 联系人——电子邮箱链接
- temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
- temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index))
- new_temporary_list3 = []
- for _split in new_split_list:
- temp_list = []
- for _entity in temporary_list3:
- if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
- _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
- temp_list.append(_entity)
- elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
- break
- new_temporary_list3.append(temp_list)
- # print(new_temporary_list3)
- match_list3 = []
- for split_index in range(len(new_temporary_list3)):
- split_entitys = new_temporary_list3[split_index]
- for index in range(len(split_entitys)):
- entity = split_entitys[index]
- if entity.entity_type == 'person':
- match_nums = 0
- for after_index in range(index + 1, min(len(split_entitys), index + 4)):
- after_entity = split_entitys[after_index]
- if match_nums > 2:
- break
- if after_entity.entity_type == 'email':
- distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- sentence_distance = after_entity.sentence_index - entity.sentence_index
- if sentence_distance == 0:
- if distance < 100:
- if (entity.label == 0 and after_entity.label == 1) or (
- entity.label == 1 and after_entity.label == 2):
- distance = distance / 100
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list3.append(Match(entity, after_entity, value))
- match_nums += 1
- else:
- if distance < 60:
- if (entity.label == 0 and after_entity.label == 1) or (
- entity.label == 1 and after_entity.label == 2):
- distance = distance / 100
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list3.append(Match(entity, after_entity, value))
- match_nums += 1
- # 前向查找匹配
- # if not match_nums:
- if index != 0:
- previous_entity = split_entitys[index - 1]
- if previous_entity.entity_type == 'email':
- if previous_entity.sentence_index == entity.sentence_index:
- distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
- tokens_num_dict[
- previous_entity.sentence_index] + previous_entity.end_index)
- if distance < 30:
- # 距离相等时,前向添加处罚值
- # distance += 1
- # 前向 没有 /10000
- value = (-1 / 2 * (distance ** 2))
- match_list3.append(Match(entity, previous_entity, value))
- # print(match_list3)
- # km算法分配求解
- result3 = dispatch(match_list3)
- for match in result3:
- match_person = match[0]
- match_email = match[1]
- match_person.pointer_email = match_email
- # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
- # temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
- # other_person = [] # 阈值以上的联系人列表
- # link_person = [] # 有电话没联系上角色的person列表
- # other_ent = []
- # link_ent = []
- # found_person = False
- # ent_list = []
- # for entity in list_entity:
- # if entity.entity_type in ['org','company','person']:
- # ent_list.append(entity)
- # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
- # #for list_index in range(len(ent_list)):
- # #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
- # #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
- # #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
- # # 2020/11/25增加确定角色联系人判断
- # sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
- # # 招标/代理在同一句中交叉情况的处理
- # for index in range(len(ent_list)):
- # entity = ent_list[index]
- # if entity.entity_text in roleSet and entity.label in [0, 1] and index+3<len(ent_list):
- # if entity.sentence_index==ent_list[index+1].sentence_index==ent_list[index+2].sentence_index==ent_list[index+3].sentence_index:
- # if ent_list[index+1].begin_index - entity.end_index < 30:
- # if ent_list[index+1].entity_text in roleSet and ent_list[index+1].label in [0, 1] and entity.label!=ent_list[index+1].label:
- # if ent_list[index+2].entity_type=="person" and ent_list[index+3].entity_type=="person" and \
- # ent_list[index+2].label==3 and ent_list[index+3].label==3:
- # ent_list[index + 1], ent_list[index + 2] = ent_list[index + 2], ent_list[index + 1]
- #
- #
- # for index in range(len(ent_list)):
- # entity = ent_list[index]
- # if entity.entity_type=="person":
- # if str(entity.label) == "0": # 2020/11/25 非联系人直接跳过
- # continue
- # if entity.values[entity.label]>on_value_person:
- # if str(entity.label)=="1":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
- # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
- # link_person.append(entity.entity_text)
- # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
- # # add pointer_person
- # for _entity in list_entity:
- # if dict_role_id.get(str(_entity.label))=="tenderee":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
- # _entity.pointer_person = entity
- # elif str(entity.label)=="2":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name=="agency":
- # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
- # link_person.append(entity.entity_text)
- # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
- # # add pointer_person
- # for _entity in list_entity:
- # if dict_role_id.get(str(_entity.label))=="agency":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
- # _entity.pointer_person = entity
- # elif str(entity.label)=="3":
- # if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人
- # continue
- # #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
- # other_person.append(entity.entity_text)
- # temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
- #
- # #if entity.entity_text in roleSet:
- # if entity.entity_text in roleSet:
- # if entity.label in [0,1]:
- # other_ent.append(entity.entity_text)
- # temp_ent_list.append((entity.entity_text, entity.label,entity))
- # for behind_index in range(index+1, len(ent_list)):
- # entity_after = ent_list[behind_index]
- # if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人
- # break
- # if entity_after.values is not None:
- # if entity_after.entity_type=="person":
- # if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
- # break
- # if entity_after.values[entity_after.label]>on_value_person:
- # if str(entity_after.label)=="1":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
- # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- # link_person.append(entity_after.entity_text)
- # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
- # elif str(entity_after.label)=="2":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name=="agency":
- # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- # link_person.append(entity_after.entity_text)
- # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
- # elif str(entity_after.label)=="3":
- # if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
- # break
- # elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
- # break
- # for pack in PackDict.keys():
- # for i in range(len(PackDict[pack]["roleList"])):
- # if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
- # #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
- # #break
- # PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- # link_person.append(entity_after.entity_text)
- # #add pointer_person
- # entity.pointer_person = entity_after
- #
- # not_link_person = [person for person in other_person if person not in link_person]
- # not_link_ent = [ent for ent in other_ent if ent not in link_ent]
- # if len(not_link_person) > 0 and len(not_link_ent) > 0 :
- # item = temp_ent_list
- # for i in range(len(item)):
- # if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
- # if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
- # item[i+1], item[i+2] = item[i+2], item[i+1]
- # for i in range(len(item)-1, -1, -1):
- # if item[i][0] in not_link_ent:
- # for pack in PackDict.keys():
- # for role in PackDict[pack]["roleList"]:
- # if role.entity_text == item[i][0] and len(role.linklist) < 1:
- # for j in range(i+1, len(item)):
- # if item[j][0] in not_link_person:
- # role.linklist.append(item[j][:2])
- # #add pointer_person
- # item[i][2].pointer_person = item[j][2]
- # break
- # else:
- # break
- # # 电话没有联系人的处理
- # role_with_no_phone = []
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
- # if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人
- # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
- # else:
- # phone_nums = 0
- # for link in PackDict["Project"]["roleList"][i].linklist:
- # if link[1]:
- # phone_nums += 1
- # break
- # if not phone_nums:
- # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
- # if role_with_no_phone:
- # phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"]
- # # phone_with_person = [phone for phone in phone_with_person if phone]
- #
- # dict_index_sentence = {}
- # for _sentence in list_sentence:
- # dict_index_sentence[_sentence.sentence_index] = _sentence
- # new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
- # for index in range(len(new_entity_list)):
- # entity = new_entity_list[index]
- # if entity.entity_text in role_with_no_phone:
- # e_sentence = dict_index_sentence[entity.sentence_index]
- # entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40]
- # entity_right = "".join(entity_right)
- # if index+1<len(new_entity_list) and entity_right.find(new_entity_list[index+1].entity_text)>-1:
- # entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)]
- # have_phone = re.findall(phone,entity_right)
- # if have_phone:
- # _phone = have_phone[0]
- # phone_begin = entity_right.find(_phone)
- # if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]):
- # # entity.person_phone = _phone
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text:
- # PackDict["Project"]["roleList"][i].linklist.append(('', _phone))
-
- #寻找多标段招标金额
- p_entity = len(list_entity)-1
- set_tenderer_money = set()
- list_tenderer_money = [] #2021/7/16 新增列表,倒序保存所有中标金额
- unit_list = [] #2021/8/17 新增,保存金额单位
- #遍历所有实体
- max_prob = 0 # 保存招标金额最大概率
- while(p_entity>=0):
- entity = list_entity[p_entity]
- if entity.entity_type=="money":
- # 2021/12/03 添加成本警戒线、保证金
- if entity.notes in ['保证金', '成本警戒线']:
- packagePointer, _flag = getPackage(PackageList, entity.sentence_index, entity.begin_index,
- "money-" + str(entity.label), MAX_DIS=2, DIRECT="L")
- if packagePointer is None:
- packageName = "Project"
- else:
- packageName = packagePointer.entity_text
- if packageName == "Project":
- # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
- # PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
- if entity.notes=="保证金" and "bond" not in PackDict["Project"]:
- PackDict["Project"]["bond"] = str(Decimal(entity.entity_text))
- elif entity.notes=="成本警戒线" and "cost_warning" not in PackDict["Project"]:
- PackDict["Project"]["cost_warning"] = str(Decimal(entity.entity_text))
- else:
- if entity.notes == "保证金" and "bond" not in PackDict[packageName]:
- PackDict[packageName]["bond"] = str(Decimal(entity.entity_text))
- elif entity.notes == "成本警戒线" and "cost_warning" not in PackDict[packageName]:
- PackDict[packageName]["cost_warning"] = str(Decimal(entity.entity_text))
- elif entity.values[entity.label]>=on_value:
- if str(entity.label)=="1" and entity.notes != '单价':
- set_tenderer_money.add(float(entity.entity_text))
- list_tenderer_money.append(float(entity.entity_text)) # 2021/7/16 新增列表,倒序保存所有中标金额
- unit_list.append(entity.money_unit)
- # if str(entity.label)=="0":
- if str(entity.label)=="0" and (entity.notes!='总投资' or float(entity.entity_text)<100000000):
- '''
- if p_entity>0:
- p_before = list_entity[p_entity-1]
- if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
- p_entity -= 1
- continue
- '''
- packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
- if packagePointer is None:
- packageName = "Project"
- else:
- packageName = packagePointer.entity_text
-
- if packageName=="Project":
- # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
- # PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
- # if entity.values[entity.label]>on_value:
- if entity.values[entity.label]>max_prob-0.005: # 选择最大概率招标金额 2024/05/23 相差0.005尽量选前面的
- if entity.notes == '单价':
- PackDict["Project"]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
- else:
- PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
- PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
- max_prob = entity.values[entity.label]
- else:
- if entity.notes == '单价':
- PackDict[packageName]["unit_tendereeMoney"] = str(Decimal(entity.entity_text))
- else:
- PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
- PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
- #add pointer_tendereeMoney
- packagePointer.pointer_tendereeMoney = entity
- p_entity -= 1
- '''包名与标段号链接'''
- l_main = []
- l_attn = []
- pack_num_main = 0
- name_num_main = 0
- pack_num_attn = 0
- name_num_attn = 0
- for entity in list_entity:
- if entity.entity_type in ['name', 'package']:
- if entity.in_attachment:
- l_attn.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
- if entity.entity_type == 'name':
- name_num_attn += 1
- else:
- pack_num_attn += 1
- else:
- l_main.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
- if entity.entity_type == 'name':
- name_num_main += 1
- else:
- pack_num_main += 1
- if name_num_main > 0 and pack_num_main > 0:
- l_main.sort(key=lambda x: [x[2],x[3]])
- # print('正文名称:',l_main)
- link_dic = {}
- i = 1
- pre_ty = l_main[0][0]
- while i < len(l_main):
- if l_main[i][0] != pre_ty:
- ty1, ent1, s1, b1, e1 = l_main[i-1]
- ty2, ent2, s2, b2, e2 = l_main[i]
- if ty1 == 'package':
- if ent1 not in link_dic:
- link_dic[ent1] = []
- if s1 == s2:
- dist = abs(b2 - b1)
- else:
- dist = len(list_sentence[s1].sentence_text) - b1
- for id in range(s1+1, s2):
- dist += len(list_sentence[id].sentence_text)
- dist += b2
- link_dic[ent1].append((s2-s1, dist, ent2))
- elif ty2 == 'package':
- if ent2 not in link_dic:
- link_dic[ent2] = []
- if s1 == s2:
- dist = abs(b2 - b1)
- else:
- dist = len(list_sentence[s1].sentence_text) - b1
- for id in range(s1+1, s2):
- dist += len(list_sentence[id].sentence_text)
- dist += b2
- link_dic[ent2].append((s2-s1, dist, ent1))
- pre_ty = l_main[i][0]
- i += 1
- for k, v in link_dic.items():
- v.sort(key=lambda x: [x[0], x[1]])
- # print('各包排序后项目名:', k, v)
- PackDict[k]["name"] = v[0][2]
- elif name_num_attn > 0 and pack_num_attn > 0:
- # print("附件名称:", l_attn)
- l_attn.sort(key=lambda x: [x[2],x[3]])
- link_dic = {}
- i = 1
- pre_ty = l_attn[0][0]
- while i < len(l_attn):
- if l_attn[i][0] != pre_ty:
- ty1, ent1, s1, b1, e1 = l_attn[i-1]
- ty2, ent2, s2, b2, e2 = l_attn[i]
- if ty1 == 'package':
- if ent1 not in link_dic:
- link_dic[ent1] = []
- if s1 == s2:
- dist = abs(b2 - b1)
- else:
- dist = len(list_sentence[s1].sentence_text) - b1
- for id in range(s1+1, s2):
- dist += len(list_sentence[id].sentence_text)
- dist += b2
- link_dic[ent1].append((s2-s1, dist, ent2))
- elif ty2 == 'package':
- if ent2 not in link_dic:
- link_dic[ent2] = []
- if s1 == s2:
- dist = abs(b2 - b1)
- else:
- dist = len(list_sentence[s1].sentence_text) - b1
- for id in range(s1+1, s2):
- dist += len(list_sentence[id].sentence_text)
- dist += b2
- link_dic[ent2].append((s2-s1, dist, ent1))
- pre_ty = l_attn[i][0]
- i += 1
- for k, v in link_dic.items():
- v.sort(key=lambda x: [x[0], x[1]])
- # print('各包排序后项目名:', k, v)
- PackDict[k]["name"] = v[0][2]
-
- #删除一个机构有多个角色的数据
- #删除重复人、概率不回传
- final_roleList = []
- list_pop = []
- set_tenderer_role = set()
- dict_pack_tenderer_money = dict()
- for pack in PackDict.keys():
- #删除无效包
- if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
- list_pop.append(pack)
- for i in range(len(PackDict[pack]["roleList"])):
- if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
- if PackDict[pack]["roleList"][i].money==0:
- set_tenderer_role.add(PackDict[pack]["roleList"][i])
- dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
- #找到包的中投标金额
- for _index in range(len(PackageList)):
- if "hit" in PackageList[_index]:
- for _hit in list(PackageList[_index]["hit"]):
- if len(_hit.split("-"))==3:
- _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
- # 补充金额前新增负号‘-’导致错误的规则
- elif len(_hit.split("-"))==4:
- _money = float(_hit.split("-")[2]) if _hit.split("-")[0] == "money" else None
- else:
- _money = None
- if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
- dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
- #只找到一个中标人和中标金额
- if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
- list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
- list(set_tenderer_role)[0].money_unit = unit_list[0]
- # print('一个中标人一个金额:', list(set_tenderer_money)[0])
- #找到一个中标人和多个招标金额
- if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
- _maxMoney = list(set_tenderer_money)[0]
- _sumMoney = 0
- for _m in list(set_tenderer_money):
- _sumMoney += _m
- if _m>_maxMoney:
- _maxMoney = _m
- if _sumMoney/_maxMoney==2:
- list(set_tenderer_role)[0].money = _maxMoney
- # print('一人多金额分项合计 取最大金额:', _maxMoney)
- else:
- # list(set_tenderer_role)[0].money = _maxMoney
- if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
- list(set_tenderer_role)[0].money = min(list_tenderer_money)
- list(set_tenderer_role)[0].money_unit = unit_list[list_tenderer_money.index(min(list_tenderer_money))]
- # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money))
- else:
- list(set_tenderer_role)[0].money = list_tenderer_money[-1] # 2021/7/16 修改 不是单价合计方式取第一个中标金额
- list(set_tenderer_role)[0].money_unit = unit_list[-1] # 金额单位
- # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1])
- #每个包都只找到一个金额
- _flag_pack_money = True
- for k,v in dict_pack_tenderer_money.items():
- if len(v[1])!=1:
- _flag_pack_money = False
- if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
- for k,v in dict_pack_tenderer_money.items():
- if float(v[0].unit_price) < float(list(v[1])[0]): # 20241128 金额大于单价时才作链接金额
- v[0].money = list(v[1])[0]
- # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
- for pack in PackDict.keys():
- for i in range(len(PackDict[pack]["roleList"])):
- if float(PackDict[pack]["tendereeMoney"]) > 0:
- # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money))
- if float(PackDict[pack]["roleList"][i].money) >10000000 and \
- float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
- PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
- # print('招标金额校正中标金额')
- # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别)
- for pack in PackDict.keys():
- for i in range(len(PackDict[pack]["roleList"])):
- if float(PackDict[pack]["tendereeMoney"]) > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
- if float(PackDict[pack]["roleList"][i].money) < 1000 and \
- float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \
- float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000:
- PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) * 10000
- # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
- for pack in PackDict.keys():
- tmp_moneys = []
- for i in range(len(PackDict[pack]["roleList"])):
- if float(PackDict[pack]["roleList"][i].money) >100000:
- tmp_moneys.append(float(PackDict[pack]["roleList"][i].money))
- if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000:
- for i in range(len(PackDict[pack]["roleList"])):
- if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
- PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
- # print('通过其他中标人投标金额校正中标金额')
- for item in list_pop:
- PackDict.pop(item)
- # 公告中只有"招标人"且无"联系人"链接时
- if len(PackDict)==1:
- k = list(PackDict.keys())[0]
- tenderee_agency_role = [role for role in PackDict[k]["roleList"] if role.role_name in ['tenderee','agency']]
- if len(tenderee_agency_role)==1:
- exist_person = []
- exist_phone = []
- for role in PackDict[k]["roleList"]:
- for group in role.linklist:
- if group[0]:
- exist_person.append(group[0])
- if group[1]:
- exist_phone.append(group[1])
- if tenderee_agency_role[0].role_name == "tenderee":
- if not tenderee_agency_role[0].linklist:
- get_contacts = False
- if not get_contacts:
- # 根据大纲Outline类召回联系人
- for outline in list_outline:
- if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary):
- for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]:
- if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[
- t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
- if t_person.person_phone:
- _phone = [p.entity_text for p in t_person.person_phone]
- for _p in _phone:
- if t_person.entity_text not in exist_person and _p not in exist_phone:
- tenderee_agency_role[0].linklist.append((t_person.entity_text, _p))
- get_contacts = True
- break
- elif words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= \
- words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
- break
- if not get_contacts:
- sentence_phone = phone.findall(outline.outline_text)
- if sentence_phone:
- if sentence_phone[0] not in exist_phone:
- tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
- get_contacts = True
- break
- if not get_contacts:
- # 直接取文中倒数第一个联系人
- for _entity in temporary_list2[::-1]:
- if _entity.entity_type=='person' and _entity.label==3:
- if _entity.person_phone:
- _phone = [p.entity_text for p in _entity.person_phone]
- for _p in _phone:
- if _entity.entity_text not in exist_person and _p not in exist_phone:
- tenderee_agency_role[0].linklist.append((_entity.entity_text, _p))
- get_contacts = True
- break
- if not get_contacts:
- # 如果文中只有一个“phone”实体,则直接取为联系人电话
- if len(phone_entitys) == 1:
- if phone_entitys[0].entity_text not in exist_phone:
- tenderee_agency_role[0].linklist.append(("", phone_entitys[0].entity_text))
- get_contacts = True
- if not get_contacts:
- # 通过大纲Outline类直接取电话
- if len(new_split_list) > 1:
- for _start, _end in new_split_list:
- temp_sentence = _content[_start:_end]
- sentence_outline = temp_sentence.split(",::")[0]
- if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
- sentence_phone = phone.findall(temp_sentence)
- if sentence_phone:
- if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in exist_phone:
- tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
- get_contacts = True
- break
- if not get_contacts:
- # 通过正则提取句子段落进行提取电话
- contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?"
- tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}"
- contact_pattern_list = [tenderee_pattern + contacts_person,
- "(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person,
- "(?:项目|采购)[^。,]{0,4}" + contacts_person,
- "(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}" + contacts_person, ]
- for _pattern in contact_pattern_list:
- get_tenderee_contacts = False
- for regular_match in re.finditer(_pattern, _content):
- match_text = _content[regular_match.end():regular_match.end() + 40]
- match_text = match_text.split("。")[0]
- sentence_phone = phone.findall(match_text)
- if sentence_phone:
- if sentence_phone[0] not in exist_phone:
- tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
- get_tenderee_contacts = True
- break
- if get_tenderee_contacts:
- break
- # 如果同一个电话连到了不同的单位就直接去掉(2024-09-03 新增)
- get_phone_dict = dict()
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- for item in PackDict[k]["roleList"][i].linklist:
- if item[1]:
- if item[1] not in get_phone_dict:
- get_phone_dict[item[1]] = set()
- get_phone_dict[item[1]].add(PackDict[k]["roleList"][i].entity_text)
- # print(get_phone_dict)
- remove_phone = []
- for phone,role_list in get_phone_dict.items():
- if len(role_list)>1:
- remove_phone.append(phone)
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- remove_list = []
- for item in PackDict[k]["roleList"][i].linklist:
- if item[1] and item[1] in remove_phone:
- remove_list.append(item)
- for _item in remove_list:
- PackDict[k]["roleList"][i].linklist.remove(_item)
- for pack in PackDict.keys():
- for i in range(len(PackDict[pack]["roleList"])):
- PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
- return PackDict
def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set, main_body_pack):
    '''
    @summary: Build the initial result dict returned by the interface from the
              collected role list and package set.
    @param:
        RoleList: iterable of role items; each must expose packageName,
                  packageCode, role_name, entity_text, role_prob and multi_winner
        PackageSet: package names; each one gets its own entry next to "Project"
        win_tenderer_set, tenderee_or_agency_set: names removed from every
                  role's multi_winner set
        main_body_pack: package names found in the main body; any other package
                  is flagged with in_attachment=True
    @return: dict mapping package name -> {"code", "tendereeMoney", "roleList",
             "tendereeMoneyUnit"[, "in_attachment"]}
    '''
    def _blank_package():
        # A fresh dict per package so the roleList lists are never shared.
        return {"code": "", "tendereeMoney": 0, "roleList": [], 'tendereeMoneyUnit': ''}

    packDict = {"Project": _blank_package()}
    for pack_name in list(PackageSet):
        pack_info = _blank_package()
        pack_info['in_attachment'] = pack_name not in main_body_pack
        packDict[pack_name] = pack_info

    for role_item in RoleList:
        target = packDict[role_item.packageName]
        # Only the first role seen for a package supplies the package code.
        if target["code"] == "":
            target["code"] = role_item.packageCode
        extra_winners = set(role_item.multi_winner) - win_tenderer_set - tenderee_or_agency_set
        # Role(role name, entity name, role probability, money, money probability,
        #      link list, multiple winners)
        target["roleList"].append(
            Role(role_item.role_name, role_item.entity_text, role_item.role_prob, 0, 0.0, [], extra_winners)
        )
    return packDict
-
def getPackageRoleMoney(list_sentence,list_entity,list_outline):
    '''
    @param:
        list_sentence: sentence list of the document
        list_entity: entity list of the document
        list_outline: passed through unchanged to findAttributeAfterEntity
    @return: [] when getRoleList returns a falsy result; otherwise the package
             dict produced by findAttributeAfterEntity (package - section code -
             role - entity name - money - contact person - contact phone)
    '''
    role_info = getRoleList(list_sentence, list_entity)
    if not role_info:
        return []

    (RoleList, RoleSet, PackageList, PackageSet,
     win_tenderer_set, tenderee_or_agency_set, main_body_pack) = role_info

    # Start from empty per-package records, then let the attribute pass fill them in.
    package_attrs = initPackageAttr(RoleList, PackageSet, win_tenderer_set,
                                    tenderee_or_agency_set, main_body_pack)
    return findAttributeAfterEntity(package_attrs, RoleSet, PackageList, PackageSet,
                                    list_sentence, list_entity, list_outline)
def turnBidWay(bidway):
    '''
    @summary: Map an extracted bidding-method string onto its canonical label.
    @param:
        bidway: the extracted bidding-method text
    @return: the canonical label of the alias group that contains bidway
             (exact match), or "其他" when no group contains it.

    Every alias group is a real tuple. The previous `bidway in ("单一来源")`
    and `bidway in ("比选")` compared against a plain string, i.e. performed a
    substring test: "" and fragments such as "单一" or "比" were classified as
    "单一来源"/"比选", and non-string input raised TypeError.
    '''
    alias_groups = (
        ("邀请招标", ("邀请招标", "采购方式:邀请")),
        ("询价", ("询价", "询单", "询比", "采购方式:询价")),
        ("竞争性谈判", ("竞谈", "竞争性谈判", "公开竞谈")),
        ("竞争性磋商", ("竞争性磋商", "磋商")),
        ("竞价", ("竞价", "竞标", "电子竞价", "以电子竞价", "电子书面竞投")),
        ("公开招标", ("公开招标", "网上电子投标", "网上招标", "采购方式:公开", "招标为其他")),
        ("单一来源", ("单一来源",)),
        ("比选", ("比选",)),
    )
    for canonical, aliases in alias_groups:
        if bidway in aliases:
            return canonical
    return "其他"
def turnMoneySource(moneysource):
    '''
    @summary: Classify a funding-source description into canonical categories.
    @param:
        moneysource: free-text description of the funding source
    @return: the matched category labels, sorted and joined with ",", when
             between 1 and 4 categories match; otherwise "其他资金".

    Fixes compared with the previous version:
      * "预算(?<!外)" was a look-behind placed after "预算", which always
        succeeds, so "预算外" was also tagged "预算内资金". It is now the
        look-ahead "预算(?!外)" (which also covers "预算内").
      * "外资" matched inside "预算外资金"; that occurrence is now excluded.
    '''
    # (pattern that must match, pattern that vetoes the match or None, label)
    rules = (
        (r"自筹|业主筹集|筹资|自有", None, "自筹"),
        (r"财政", r"非财政", "财政资金"),
        (r"拨款|补助|划拨|拨付|国拨|上级资金", None, "上级拨款"),
        (r"社会资本|社会资金", None, "社会资本"),
        (r"贷款|借款|借贷", None, "贷款资金"),
        (r"债券|债|国债", None, "债券资金"),
        (r"专项|项目资金", None, "项目专项资金"),
        (r"配套", None, "配套资金"),
        (r"(?<!预算)外资", None, "外资"),
        (r"国有资金|国企资金|国资|国家投资", None, "国有资金"),
        (r"投资|融资", None, "投资资金"),
        (r"预算(?!外)", None, "预算内资金"),
        (r"预算外", None, "预算外资金"),
    )
    result_list = []
    for pattern, veto, label in rules:
        if not re.search(pattern, moneysource):
            continue
        if veto is not None and re.search(veto, moneysource):
            continue
        result_list.append(label)
    result_list.sort()
    # Five or more simultaneous categories are treated as unclassifiable.
    if 0 < len(result_list) < 5:
        return ",".join(result_list)
    return "其他资金"
# Matches one "year / month / day" date. The year is 20xx, a 2-digit number or
# a Chinese-numeral year starting with 二; month and day are 1-2 digits or 1-3
# Chinese numerals; the separators are -, /, . or 年 / 月.
# Written as a raw string so that \d and \s are regex escapes rather than
# invalid string escapes (a SyntaxWarning on recent Python versions).
my_time_format_pattern = re.compile(r"((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
- from BiddingKG.dl.ratio.re_ratio import getUnifyNum
def my_timeFormat(_time,page_time):
    '''
    @summary: Extract every date matched by my_time_format_pattern from _time
              and normalise it.
    @param:
        _time: text that may contain one or more dates
        page_time: reference date as 'YYYY-MM-DD'; when falsy the current local
                   year is used as the reference year
    @return: list of "year-month-day" strings (month and day zero-padded to two
             characters) for the matches that pass all checks, in match order.

    A numeric year is rejected when it is more than 1 year before the reference
    year, or more than 5 years (2-digit year, prefixed with "20") / 10 years
    (other numeric years) after it. Non-numeric years are converted character
    by character with getDigitsDic and are not range-checked here. Regex
    literals are raw strings so "\\d" is not an invalid string escape.
    '''
    if page_time:
        current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
    else:
        current_year = time.strftime("%Y",time.localtime())
    reference_year = int(current_year)

    def _check_part(text, upper):
        # Validate a month/day fragment; returns (value, is_legal).
        if text == "":
            return text, False
        if re.search(r"^\d+$", text):
            return text, int(text) <= upper
        # Not plain digits: convert the numeral text with getUnifyNum.
        number = int(getUnifyNum(text))
        if 1 <= number <= upper:
            return str(number), True
        return number, False

    time_list = []
    for _match in re.finditer(my_time_format_pattern, _time):
        if len(_match.group()) == 0:
            continue
        year = _match.group("year")
        month = _match.group("month")
        day = _match.group("day")

        legal = True
        if year != "":
            if re.search(r"^\d+$", year):
                if len(year) == 2:
                    year = "20" + year
                    max_ahead = 5
                else:
                    max_ahead = 10
                year_delta = int(year) - reference_year
                if year_delta > max_ahead or year_delta < -1:
                    legal = False
            else:
                # '0' is kept as-is; every other character goes through getDigitsDic.
                year = "".join(word if word == '0' else str(getDigitsDic(word)) for word in year)
        else:
            legal = False

        month, month_ok = _check_part(month, 12)
        day, day_ok = _check_part(day, 31)
        if not (month_ok and day_ok):
            legal = False

        if not isValidDate(int(year),int(month),int(day)):
            legal = False
        if legal:
            # Round-trip through int() to normalise the digit strings before padding.
            year = str(int(year))
            month = str(int(month))
            day = str(int(day))
            time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
    return time_list
- def getTimeAttributes(list_entity,list_sentence,page_time):
- # from BiddingKG.dl.interface.htmlparser import get_childs
- # document_tree = parse_document.tree
- # new_document_tree = []
- # _data_i = -1
- # while _data_i < len(document_tree) - 1:
- # _data_i += 1
- # _data = document_tree[_data_i]
- # _type = _data["type"]
- # if _type == "sentence":
- # if _data["sentence_title"] is not None:
- # new_document_tree.append(_data)
- # document_tree = new_document_tree
- time_entitys = [i for i in list_entity if i.entity_type=='time']
- time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
- list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
- dict_time = {
- "time_release": [], # 1 发布时间
- "time_bidopen": [], # 2 开标时间
- "time_bidclose": [], # 3 截标时间
- 'time_bidstart': [], # 12 投标(开始)时间、响应文件接收(开始)时间
- 'time_publicityStart': [], # 4 公示开始时间(公示时间、公示期)
- 'time_publicityEnd': [], # 5 公示截止时间
- 'time_getFileStart': [], # 6 文件获取开始时间(文件获取时间)
- 'time_getFileEnd': [], # 7 文件获取截止时间
- 'time_registrationStart': [], # 8 报名开始时间(报名时间)
- 'time_registrationEnd': [], # 9 报名截止时间
- 'time_earnestMoneyStart': [], #10 保证金递交开始时间(保证金递交时间)
- 'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间
- 'time_commencement':[] , #13 开工日期
- 'time_completion': [], # 14 竣工日期
- 'time_listingStart': [], # 15 挂牌开始日期(挂牌时间)
- 'time_listingEnd': [], # 16 挂牌结束日期、挂牌截止日期
- 'time_signContract': [], # 17 合同签订时间
- 'time_contractStart': [], # 18 合同开始时间
- 'time_contractEnd': [] # 19 合同结束时间
- }
- dict_time2label = {
- "time_release": 1, # 1 发布时间
- "time_bidopen": 2, # 2 开标时间
- "time_bidclose": 3, # 3 截标时间
- 'time_bidstart': 12, # 12 投标(开始)时间、响应文件接收(开始)时间
- 'time_publicityStart': 4, # 4 公示开始时间(公示时间、公示期)
- 'time_publicityEnd': 5, # 5 公示截止时间
- 'time_getFileStart': 6, # 6 文件获取开始时间(文件获取时间)
- 'time_getFileEnd': 7, # 7 文件获取截止时间
- 'time_registrationStart': 8, # 8 报名开始时间(报名时间)
- 'time_registrationEnd': 9, # 9 报名截止时间
- 'time_earnestMoneyStart': 10, # 10 保证金递交开始时间(保证金递交时间)
- 'time_earnestMoneyEnd': 11, # 11 保证金递交截止时间
- 'time_commencement': 13, # 13 开工日期
- 'time_completion': 14, # 14 竣工日期
- 'time_listingStart': 15, # 15 挂牌开始日期(挂牌时间)
- 'time_listingEnd': 16, # 16 挂牌结束日期、挂牌截止日期
- 'time_signContract': 17, # 17 合同签订时间
- 'time_contractStart': 18, # 18 合同开始时间
- 'time_contractEnd': 19 # 19 合同结束时间
- }
- last_sentence_index = 0
- last_time_type = ""
- last_time_index = {
- 'time_bidstart':"time_bidclose",
- 'time_publicityStart':"time_publicityEnd",
- 'time_getFileStart':"time_getFileEnd",
- 'time_registrationStart':"time_registrationEnd",
- 'time_earnestMoneyStart':"time_earnestMoneyEnd",
- 'time_commencement':"time_completion",
- 'time_listingStart':"time_listingEnd",
- 'time_contractStart':"time_contractEnd"
- }
- time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
- time_entitys = [item for item in time_entitys if item[1]]
- for entity_idx in range(len(time_entitys)):
- entity = time_entitys[entity_idx][0]
- extract_time = time_entitys[entity_idx][1]
- sentence_text = list_sentence[entity.sentence_index].sentence_text
- previous_entity = time_entitys[entity_idx-1][0] if entity_idx!=0 else None
- previous_extract_time = time_entitys[entity_idx-1][1] if entity_idx!=0 else None
- next_entity = time_entitys[entity_idx+1][0] if entity_idx!=len(time_entitys)-1 else None
- next_extract_time = time_entitys[entity_idx+1][1] if entity_idx!=len(time_entitys)-1 else None
- # 实体有效上下文
- entity_context_begin = previous_entity.wordOffset_end if previous_entity and previous_entity.sentence_index==entity.sentence_index else 0
- entity_context_end = next_entity.wordOffset_begin if next_entity and next_entity.sentence_index==entity.sentence_index else len(sentence_text)
- if entity.sentence_index!=last_sentence_index:
- # sentence_index 不同句子重置last_time_type
- last_time_type = ""
- entity_left = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 2):entity.wordOffset_begin]
- entity_left2 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 10):entity.wordOffset_begin]
- entity_left3 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 30):entity.wordOffset_begin]
- entity_right = sentence_text[entity.wordOffset_end:min(entity.wordOffset_end + 3,entity_context_end)]
- entity_right2 = sentence_text[entity.wordOffset_end:entity_context_end]
- entity_right2 = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",'',entity_right2)[:60] # 去除网址
- # print(entity.entity_text,entity_right2)
- label_prob = entity.values[entity.label]
- entity_text = entity.entity_text
- in_attachment = entity.in_attachment
- # extract_time = my_timeFormat(entity_text,page_time)
- # print(entity_text,entity_left2)
- if extract_time:
- definite_time_list = []
- t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{1,2})?[::分]?(?P<second>\d{2})?秒?")
- _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
- _entity_text_len = len(_entity_text)
- _entity_text = _entity_text + sentence_text[entity.wordOffset_end:entity.wordOffset_end+20]
- t_in_word_num = len(re.findall(t,_entity_text))
- # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
- begin_index = 0
- for _num in range(t_in_word_num):
- if begin_index> _entity_text_len + 8:
- break
- t_in_word = re.search(t, _entity_text[begin_index:])
- if t_in_word:
- if _num==0 and t_in_word.start() > _entity_text_len + 8:
- break
- begin_index = t_in_word.end()
- # print('t_in_word',entity_text,t_in_word.groupdict())
- day = t_in_word.groupdict().get('day',"")
- hour = t_in_word.groupdict().get('hour',"")
- half_hour = t_in_word.groupdict().get('half_hour',"")
- minute = t_in_word.groupdict().get('minute',"")
- second = t_in_word.groupdict().get('second',"")
- if hour:
- if day=='下午' and int(hour)<12:
- hour = str(int(hour)+12)
- if int(hour)>24:
- continue
- else:
- hour = "00"
- if not minute:
- if half_hour:
- minute = "30"
- else:
- minute = "00"
- if int(minute)>60:
- continue
- if not second:
- second = "00"
- if int(second)>60:
- continue
- definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
- # print(definite_time)
- definite_time_list.append(definite_time)
- min_len = min(len(extract_time),len(definite_time_list))
- for i in range(min_len):
- if definite_time_list[i] == "24:00:00": # 修正不规范时间表述
- definite_time_list[i] = "23:59:59"
- if definite_time_list[i] != "00:00:00":
- extract_time[i] = extract_time[i] + " " + definite_time_list[i]
- if extract_time:
- # 时间变更prob优化
- if re.search("原",entity_left2):
- last_index = 0
- for item in re.finditer("原",entity_left2):
- last_index = item.start() + 1
- label_prob = label_prob - 0.2 * last_index / len(entity_left2)
- # print('prob优化',label_prob,extract_time)
- elif re.search("改正|更正|修正|更改|延期",entity_left2):
- new_label = dict_time2label.get(last_time_type,None)
- if new_label and entity.label==0:
- entity.label = new_label
- label_prob = 1
- # 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
- if entity.label in [2,3,9]:
- if entity.label==2 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
- dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment))
- if entity.label==3 and re.search("开标|(评审|比选).{,2}(?:开始)?(时间|日期)|选取.{,2}(时间|日期)",entity_left3):
- dict_time['time_bidopen'].append((extract_time[0], label_prob-0.1, in_attachment))
- if entity.label==3 and re.search("报名",entity_left3):
- dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
- if entity.label==3 and re.search("获取",entity_left3[-20:]):
- dict_time['time_getFileEnd'].append((extract_time[0], 0.45, in_attachment))
- if entity.label==9 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
- dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment))
- if entity.label in [11, 3]:
- if entity.label==11 and re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
- dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
- if entity.label==3 and re.search("保证金.{,2}(接受|收取)|(接受|收取).{,2}保证金",entity_left3):
- dict_time['time_earnestMoneyEnd'].append((extract_time[0], 0.5, in_attachment))
- if entity.label in [6, 7]:
- if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
- dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
- if entity.label==0:
- if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
- if len(extract_time)>=2:
- dict_time['time_bidstart'].append((extract_time[0], 0.45, in_attachment))
- dict_time['time_bidclose'].append((extract_time[1], 0.45, in_attachment))
- else:
- dict_time['time_bidclose'].append((extract_time[0], 0.45, in_attachment))
- if entity.label==6:
- # "文件获取时间"和"报名时间"并列
- if re.search("报名",entity_left3):
- if len(extract_time)==1:
- dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
- else:
- dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
- dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
- # 获取文件/报名/报价 时间补充(上下文表达过长无法通过模型识别)
- # if entity.label == 0:
- # if re.search("(获取|领取|售卖|出售|购买|下载).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_left3):
- # if len(extract_time)==2:
- # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
- # dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment))
- # else:
- # if next_entity and next_entity.sentence_index==entity.sentence_index:
- # mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
- # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1:
- # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
- # dict_time['time_getFileEnd'].append((next_extract_time[0], 0.51, in_attachment))
- # if not dict_time['time_getFileEnd']:
- # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
- # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
- # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
- # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
- # if re.search("(进行|在线|线下|线上|网上).{,2}报名|报名.{,2}(开始)?(时间|日期)", entity_left3):
- # if len(extract_time)==2:
- # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
- # dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
- # else:
- # if next_entity and next_entity.sentence_index==entity.sentence_index:
- # mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
- # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1:
- # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
- # dict_time['time_registrationEnd'].append((next_extract_time[0], 0.51, in_attachment))
- # if not dict_time['time_registrationEnd']:
- # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
- # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
- # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
- # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
- #
- # if re.search("(获取|售卖|出售|购买).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_right2):
- # if len(extract_time)==2:
- # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
- # dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment))
- # else:
- # if previous_entity and previous_entity.sentence_index==entity.sentence_index:
- # mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
- # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
- # dict_time['time_getFileStart'].append((previous_extract_time[0], 0.51, in_attachment))
- # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
- # if not dict_time['time_getFileEnd']:
- # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
- # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
- # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
- # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
- # if re.search("(进行|在线|线下).{,2}报名", entity_right2):
- # if len(extract_time) == 2:
- # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
- # dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
- # else:
- # if previous_entity and previous_entity.sentence_index==entity.sentence_index:
- # mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
- # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
- # dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment))
- # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
- # if not dict_time['time_registrationEnd']:
- # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
- # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
- # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
- # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
- # if re.search("(进行|开始).{,4}(报价|投标|竞价)", entity_right2):
- # if len(extract_time) == 2:
- # dict_time['time_bidstart'].append((extract_time[0], 0.51, in_attachment))
- # # dict_time['time_bidclose'].append((extract_time[1], 0.51, in_attachment))
- # 补充公告末尾处的发布时间
- if entity.label==0:
- if entity.is_tail:
- entity.label = 1
- entity.values[1] = 0.5
- dict_time['time_release'].append((extract_time[0], 0.5, in_attachment))
- # 2022/12/12 新增挂牌时间正则
- if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
- if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
- if len(extract_time) == 1:
- if re.search("挂牌.?(开始|起始).?(?:时间|日期)",entity_left2):
- dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
- last_time_type = 'time_listingStart'
- elif re.search("挂牌.?(截[止至]|结束).?(?:时间|日期)",entity_left2):
- dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
- last_time_type = 'time_listingEnd'
- elif re.search("挂牌.?(?:时间|日期)",entity_left2):
- if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
- dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
- last_time_type = 'time_listingEnd'
- else:
- dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
- last_time_type = 'time_listingStart'
- else:
- dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
- dict_time['time_listingEnd'].append((extract_time[1], 0.5, in_attachment))
- last_time_type = ''
- last_sentence_index = entity.sentence_index
- continue
- # 2023/9/13 新增合同相关时间
- if re.search("合同|服务|履[约行]", entity_left3[-15:]):
- if len(extract_time) == 1:
- if re.search("(合同.{,2}签[订定署].{,2}|签[订定署].{,2}合同.{,2})(?:时间|日期)|合同签[订定署].{,1}$", entity_left2):
- dict_time['time_signContract'].append((extract_time[0], 0.5, in_attachment))
- last_time_type = 'time_signContract'
- last_sentence_index = entity.sentence_index
- continue
- elif re.search("(合同|服务|履约|(合同|服务)履行).{,4}(?:起始|开始)(?:时间|日期)", entity_left3[-15:]):
- dict_time['time_contractStart'].append((extract_time[0], 0.55, in_attachment))
- last_time_type = 'time_contractStart'
- last_sentence_index = entity.sentence_index
- continue
- elif re.search("(合同|服务|履约).{,2}(?:完成|截止|结束)(?:时间|日期|时限)", entity_left2):
- dict_time['time_contractEnd'].append((extract_time[0], 0.55, in_attachment))
- last_time_type = 'time_contractEnd'
- last_sentence_index = entity.sentence_index
- continue
- elif re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
- if re.search("到|至|截[至止]",entity_left) or re.search("前|止|截止",entity_right) or re.search("前",entity_text[-2:]):
- dict_time['time_contractEnd'].append((extract_time[0], 0.5, in_attachment))
- last_time_type = 'time_contractEnd'
- else:
- dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
- last_time_type = 'time_contractStart'
- last_sentence_index = entity.sentence_index
- continue
- else:
- if re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
- # 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
- if extract_time[0]!=extract_time[1]:
- dict_time['time_contractStart'].append((extract_time[0], 0.6, in_attachment))
- dict_time['time_contractEnd'].append((extract_time[1], 0.6, in_attachment))
- last_time_type = ''
- last_sentence_index = entity.sentence_index
- continue
- # 服务期限表达补充
- if entity.label==0:
- re_service = '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
- '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \
- '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)(时间|日期)|交付\(服务、完工\)(时间|日期)' \
- '|交货(时间|日期)|工期承诺|(服务|合同|施工|实施|工程|设计)的?(年限|期限|周期|期:)' \
- '|服务期限为|计划工期|工期要求|服务期限|服务期' \
- '|投标工期|设计工期|合格服务周期|总工期|服务(时间|日期)(范围)?|流转期限|维护期限|服务时限|交货期' \
- '|完成(时间|日期)|服务期限|中标工期|项目周期|期限要求|供货期|合同履行日期|计划的?周期' \
- '|履约期限|合同约定完成时限|合同完成日期|承诺完成日期' \
- '|合同起始日起|合同履约期|履约截止日期|承包期限|合同完成日期' \
- '|服务期间|服务履行期|委托(管理)?期限|履约期限、地点等简要信息'
- if len(extract_time)==2:
- if re.search(re_service,entity_left2) or re.search("履约期限、地点等简要信息",entity_left3[-20:]):
- dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
- dict_time['time_contractEnd'].append((extract_time[1], 0.5, in_attachment))
- last_time_type = ''
- # 报价/投标时间补充(规则补充)
- if entity.label == 0:
- if re.search("[报竞]价.{,2}(开始|起始).{,2}(时间|日期)",entity_left2):
- entity.label = 12
- label_prob = 0.8
- elif re.search("[报竞]价.{,2}起止.{,2}(时间|日期)",entity_left2):
- entity.label = 12
- label_prob = 0.6
- elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)[::]|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)[::]",entity_left2):
- entity.label = 3
- label_prob = 0.501
- elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)",entity_left2) and not re.search("截[止至]",entity_left2):
- entity.label = 12
- label_prob = 0.51
- elif re.search("[报竞]价.{,2}截[止至].{,2}(时间|日期)",entity_left2):
- entity.label = 3
- label_prob = 0.8
- elif re.search("(竞价|报价).?(时间|日期)",entity_left2):
- entity.label = 12
- label_prob = 0.51
- elif re.search("(竞价|报价).?(时间|日期)",entity_left3) and re.search("参与|报价|有意",entity_left2):
- entity.label = 12
- label_prob = 0.501
- # 文档结构补充
- # if entity.label == 0:
- # re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|"
- # "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)")
- # _data_i = -1
- # while _data_i < len(document_tree) - 1:
- # _data_i += 1
- # _data = document_tree[_data_i]
- # _type = _data["type"]
- # _text = _data["text"].strip()
- # childs = get_childs([_data])
- # last_child = childs[-1]
- # if entity.sentence_index>=_data.sentence_index and entity.wordOffset_begin>=_data.wordOffset_begin and
- # ():
- # if re.search(re_registration, re.split("[::;;,]", _text)[0][:20]) is not None:
- #
- # content_text = ""
- # for c in childs:
- # content_text += c["text"] + ""
- # print('concat_text', content_text)
- if re.search(",(完成|截止|结束)(时间|日期)", entity_left2[-8:]) and entity.label==0:
- if entity.sentence_index == last_sentence_index:
- time_type = last_time_index.get(last_time_type)
- if time_type:
- dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment))
- last_time_type = ""
- last_sentence_index = entity.sentence_index
- continue
- if re.search("至|到|[日\d][-—]$|[~~]", entity_left):
- if entity.sentence_index == last_sentence_index:
- time_type = last_time_index.get(last_time_type)
- if time_type:
- dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment))
- last_time_type = ""
- last_sentence_index = entity.sentence_index
- continue
- if entity.label!=0:
- if entity.label==1 and label_prob>0.5:
- dict_time['time_release'].append((extract_time[0],label_prob,in_attachment))
- last_time_type = 'time_release'
- elif entity.label==2 and label_prob>0.5:
- dict_time['time_bidopen'].append((extract_time[0],label_prob,in_attachment))
- last_time_type = 'time_bidopen'
- elif entity.label==3 and label_prob>0.5:
- if len(extract_time)==1:
- dict_time['time_bidclose'].append((extract_time[0],label_prob,in_attachment))
- last_time_type = 'time_bidclose'
- elif len(extract_time)==2:
- dict_time['time_bidstart'].append((extract_time[0], 0.6, in_attachment))
- dict_time['time_bidclose'].append((extract_time[1], label_prob, in_attachment))
- last_time_type = 'time_bidclose'
- elif entity.label==12 and label_prob>0.5:
- if len(extract_time)==1:
- if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
- dict_time['time_bidclose'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_bidclose'
- else:
- dict_time['time_bidstart'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_bidstart'
- else:
- dict_time['time_bidstart'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_bidclose'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==4 and label_prob>0.5:
- if len(extract_time)==1:
- if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
- dict_time['time_publicityEnd'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_publicityEnd'
- else:
- dict_time['time_publicityStart'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_publicityStart'
- else:
- dict_time['time_publicityStart'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_publicityEnd'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==5 and label_prob>0.5:
- if len(extract_time)==1:
- dict_time['time_publicityEnd'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_publicityEnd'
- else:
- dict_time['time_publicityStart'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_publicityEnd'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==6 and label_prob>0.5:
- if len(extract_time)==1:
- if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
- dict_time['time_getFileEnd'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_getFileEnd'
- else:
- dict_time['time_getFileStart'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_getFileStart'
- else:
- dict_time['time_getFileStart'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_getFileEnd'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==7 and label_prob>0.5:
- if len(extract_time)==1:
- dict_time['time_getFileEnd'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_getFileEnd'
- else:
- dict_time['time_getFileStart'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_getFileEnd'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==8 and label_prob>0.5:
- if len(extract_time)==1:
- if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
- dict_time['time_registrationEnd'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_registrationEnd'
- else:
- dict_time['time_registrationStart'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_registrationStart'
- else:
- dict_time['time_registrationStart'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_registrationEnd'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==9 and label_prob>0.5:
- if len(extract_time)==1:
- dict_time['time_registrationEnd'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_registrationEnd'
- else:
- dict_time['time_registrationStart'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_registrationEnd'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==10 and label_prob>0.5:
- if len(extract_time)==1:
- if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
- dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_earnestMoneyEnd'
- else:
- dict_time['time_earnestMoneyStart'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_earnestMoneyStart'
- else:
- dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==11 and label_prob>0.5:
- if len(extract_time)==1:
- dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_earnestMoneyEnd'
- else:
- dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==13 and label_prob>0.5:
- if len(extract_time)==1:
- if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
- dict_time['time_completion'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_completion'
- else:
- dict_time['time_commencement'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_commencement'
- else:
- dict_time['time_commencement'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_completion'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- elif entity.label==14 and label_prob>0.5:
- if len(extract_time)==1:
- dict_time['time_completion'].append((extract_time[0], label_prob,in_attachment))
- last_time_type = 'time_completion'
- else:
- dict_time['time_commencement'].append((extract_time[0],label_prob,in_attachment))
- dict_time['time_completion'].append((extract_time[1],label_prob,in_attachment))
- last_time_type = ''
- else:
- last_time_type = ""
- else:
- last_time_type = ""
- else:
- last_time_type = ""
- last_sentence_index = entity.sentence_index
- # 通过文档分析树形结构补充部分时间实体
- def add_time_by_parseDocument(dict_time,parse_document):
- from BiddingKG.dl.interface.htmlparser import get_childs
- document_tree = parse_document.tree
- # if not dict_time['time_getFileStart'] or not dict_time['time_getFileEnd']:
- # time_pattern = re.compile("")
- concat_text_list = []
- if not dict_time['time_registrationStart'] or not dict_time['time_registrationEnd']:
- re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|"
- "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)")
- _data_i = -1
- while _data_i < len(document_tree) - 1:
- _data_i += 1
- _data = document_tree[_data_i]
- _type = _data["type"]
- _text = _data["text"].strip()
- # print(_data.keys())
- if _type == "sentence":
- print('_text:',_text,_data["sentence_title"])
- if _data["sentence_title"] is not None:
- print("aptitude_pattern", _text)
- print(_data['sentence_index'],_data['wordOffset_begin'],_data['wordOffset_end'])
- if re.search(re_registration, re.split("[::;;。]",_text)[0][:15]) is not None:
- childs = get_childs([_data])
- concat_text = ""
- for c in childs:
- concat_text += c["text"] + ""
- print('concat_text',concat_text)
- concat_text_list.append(concat_text)
- _data_i += len(childs)-1
- # if _type == "table":
- # list_table = _data["list_table"]
- # parent_title = _data["parent_title"]
- # if list_table is not None:
- # for line in list_table[:2]:
- # for cell_i in range(len(line)):
- # cell = line[cell_i]
- # cell_text = cell[0]
- # if len(cell_text) > 120 and re.search(re_registration, cell_text) is not None:
- # concat_text += cell_text + "\n"
- print('_text',concat_text_list)
- for text in concat_text_list:
- time_list = re.finditer(my_time_format_pattern,text)
- time_list = [(i,my_timeFormat(i.group(),page_time)) for i in time_list]
- for time_idx in range(len(time_list)):
- _time = time_list[time_idx][0]
- extract_time = time_list[time_idx][1]
- entity_left = text[:_time.start()]
- entity_left = re.split("[。;;!??]",entity_left)[-1]
- # entity_left2 = sentence_text[
- # max(entity_context_begin, entity.wordOffset_begin - 10):entity.wordOffset_begin]
- # entity_left3 = sentence_text[
- # max(entity_context_begin, entity.wordOffset_begin - 30):entity.wordOffset_begin]
- entity_right = text[_time.end():]
- entity_right = re.split("[。;;!??]",entity_right)[0]
- # entity_right2 = sentence_text[entity.wordOffset_end:entity_context_end]
- entity_right2 = re.sub(r"(http[s]?://)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F])){6,}",
- '', entity_right)[:60] # 去除网址
- print('entity_right2',entity_right2)
- if re.search("(进行|在线|线下).{,2}报名", entity_right2):
- print('报名text',entity_right2)
- if len(extract_time) == 2:
- dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
- dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
- else:
- if previous_entity and previous_entity.sentence_index==entity.sentence_index:
- mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
- if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
- dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment))
- dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
- if not dict_time['time_registrationEnd']:
- if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
- dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
- elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
- dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
- return dict_time
- # dict_time = add_time_by_parseDocument(dict_time,parse_document)
- # print(dict_time)
- result_dict = dict((key,"") for key in dict_time.keys())
- for time_type,value in dict_time.items():
- list_time = dict_time[time_type]
- if list_time:
- for in_attachment in [False,True]:
- _list_time = [_time for _time in list_time if _time[2]==in_attachment]
- if _list_time:
- _list_time.sort(key=lambda x:(x[1],len(x[0])),reverse=True) # sort_key: label_prob,时间文本长度(优先有具体时分秒的)
- if in_attachment==True and len(result_dict[time_type])>0:
- break
- result_dict[time_type] = _list_time[0][0]
- # result_dict 纠错
- if not result_dict['time_bidclose']:
- if result_dict['time_bidstart']: # 无截标时间,投标开始和开标时间一样
- if result_dict['time_bidstart'][:10] in result_dict['time_bidopen']:
- result_dict['time_bidstart'] = ""
- result_dict['time_bidclose'] = result_dict['time_bidopen']
- if not result_dict['time_bidclose']:
- if result_dict['time_getFileEnd']: # 无截标时间,获取文件截止时间和开标时间一样
- if result_dict['time_getFileEnd'][:10] in result_dict['time_bidopen']:
- result_dict['time_bidclose'] = result_dict['time_bidopen']
- else:
- if result_dict['time_bidopen']: # 截标时间 和 开标时间 时分秒互补
- if len(result_dict['time_bidclose'])<len(result_dict['time_bidopen']) and result_dict['time_bidclose'] in result_dict['time_bidopen']:
- result_dict['time_bidclose'] = result_dict['time_bidopen']
- elif len(result_dict['time_bidclose'])>len(result_dict['time_bidopen']) and result_dict['time_bidopen'] in result_dict['time_bidclose']:
- result_dict['time_bidopen'] = result_dict['time_bidclose']
- return result_dict
def get_days_between(day1,day2,get_abs=0):
    """Return the number of days from ``day1`` to ``day2``.

    :param day1: earlier date as a ``YYYY-MM-DD`` string
    :param day2: later date as a ``YYYY-MM-DD`` string
    :param get_abs: when truthy, return the absolute value of the difference
    :return: ``day2 - day1`` in days (negative when ``day2`` precedes ``day1``
        and ``get_abs`` is falsy)
    """
    date_format = '%Y-%m-%d'
    # Parse both strings into datetime objects (day1 first, then day2).
    earlier, later = (datetime.strptime(day, date_format) for day in (day1, day2))
    gap = (later - earlier).days
    return abs(gap) if get_abs else gap
def extract_serviceTime(service_time,page_time):
    """Parse a free-text service period into start date, end date and duration.

    Chinese numerals in ``service_time`` are first rewritten as Arabic digits.
    The text is then interpreted by the first rule that applies:

    1. full dates (year-month-day): the first one or two valid dates become
       the end date, or the start/end pair;
    2. year-month dates: same as above, using the last day of each month;
    3. a quantity with a unit (day / week / month / year): converted to days;
    4. the literal "半年": 180 days.

    An end date is only accepted when it lies more than one day after
    ``page_time``; a start/end pair additionally requires start < end.

    :param service_time: raw service-period text
    :param page_time: publication date text; the first ``20YY-MM-DD`` found in
        it is used, otherwise (including ``None``/empty) ``2000-01-01``
    :return: dict with keys ``service_start`` (str), ``service_end`` (str) and
        ``service_days`` (int); unset values are ``""`` / ``0``
    """
    pattern1 = re.compile(r"\d{4}[年\-./]\d{1,2}[月\-./]\d{1,2}日?")
    pattern2 = re.compile(r"\d+(?:\.\d+)?[((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
    pattern3 = re.compile(r"\d{4}[年\-./]\d{1,2}月?")
    pattern4 = re.compile(r"(?:日|天|周年|年|月|周|日历[天日]|工作[天日]|星期)[^\d]{1,3}\d+(?:\.\d+)?")
    number_pattern = re.compile(r"\d+(?:\.\d+)?")
    cn_number_pattern = re.compile(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+')
    DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
                 "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9,
                 "两":2, '貮': 2}

    def get_month_days(year, month):
        # calendar.monthrange(year, month) returns (weekday of the first day,
        # number of days in the month); only the day count is needed.
        _, last_day = calendar.monthrange(year, month)
        return last_day

    def get_num(text):
        """Convert the first Chinese numeral in ``text`` to an int ("" on failure)."""
        CN_UNIT = {'十': 10,'拾': 10,'百': 100,
                   '佰': 100,'千': 1000,'仟': 1000}
        found = cn_number_pattern.search(text)
        if not found:
            return ""
        text = found.group()
        result = 0
        result_list = []
        unit = 0
        control = 0
        for i, d in enumerate(text):
            # A numeral cannot start with zero or with a bare hundred/thousand unit.
            if d in '零百佰千仟' and i == 0:
                return ""
            if d in DigitsDic:
                result += DigitsDic[d]
            elif d in CN_UNIT:
                if unit == 0:
                    unit_1 = CN_UNIT[d]
                    # First unit seen: a leading unit ("十") stands for itself,
                    # otherwise it scales the digits accumulated so far.
                    if result == 0:
                        result = CN_UNIT[d]
                    else:
                        result *= CN_UNIT[d]
                    unit = CN_UNIT[d]
                    result_1 = result
                elif unit > CN_UNIT[d]:
                    prev_digit = DigitsDic.get(text[i - 1])
                    if prev_digit is None:
                        # The previous char is itself a unit (e.g. "一百十"):
                        # treat the omitted digit as an implicit "一" instead
                        # of failing on the lookup.
                        result += CN_UNIT[d]
                    else:
                        # Re-weight the digit that was just added as units.
                        result -= prev_digit
                        result += prev_digit * CN_UNIT[d]
                    unit = CN_UNIT[d]
                elif unit <= CN_UNIT[d]:
                    if (CN_UNIT[d] < unit_1) and (len(result_list) == control):
                        result_list.append(result_1)
                        result = (result - result_1) * CN_UNIT[d]
                        control += 1
                    else:
                        result *= CN_UNIT[d]
                    unit = CN_UNIT[d]
                    if len(result_list) == control:
                        unit_1 = unit
                        result_1 = result
            else:
                return ""
        return sum(result_list) + result

    def format_date(year, month, day):
        # Zero-padded YYYY-MM-DD, the format get_days_between expects.
        return '%d-%02d-%02d' % (year, month, day)

    def apply_dates(time_list):
        """Store the extracted dates as service_end / service_start when plausible."""
        if len(time_list) >= 2:
            if get_days_between(page_time, time_list[1]) > 1 and get_days_between(time_list[0], time_list[1]) > 0:
                serviceTime_dict['service_end'] = time_list[1]
                serviceTime_dict['service_start'] = time_list[0]
        elif len(time_list) == 1:
            if get_days_between(page_time, time_list[0]) > 1:
                serviceTime_dict['service_end'] = time_list[0]

    def duration_days(match_text):
        """Convert a matched "<number><unit>" / "<unit>...<number>" text to days (0 if implausible)."""
        unit = 1
        if "月" in match_text:
            unit = 30
        elif "年" in match_text:
            unit = 365
        elif "周" in match_text or "星期" in match_text:
            unit = 7
        # Keep the decimal part: the patterns accept values such as "1.5年".
        match_num = float(number_pattern.search(match_text).group())
        # A count that is a multiple of 365 is taken to be a number of days,
        # whatever unit was written.
        if match_num >= 365 and match_num % 365 == 0:
            unit = 1
        if unit == 365:
            if match_num > 10:  # unit is years: reject implausibly large counts
                match_num = 0
        elif unit == 30:
            if match_num > 60:  # unit is months: reject implausibly large counts
                match_num = 0
        elif unit == 1:
            if match_num > 4000:  # unit is days: reject implausibly large counts
                match_num = 0
        days = int(match_num * unit)
        # Durations expressed on a 30-day-month basis are mapped to calendar
        # years (360 -> 365, 540 -> 365 + 180, ...).
        if days % 360 == 0:
            days = days / 360 * 365
        elif days % 180 == 0:
            days = days // 360 * 365 + 180
        days = int(days)
        # Out-of-range results are discarded.
        if days <= 1 or days > 4000:
            days = 0
        return days

    serviceTime_dict = {"service_start": "", "service_end": "", "service_days": 0}

    # Rewrite Chinese numerals as Arabic digits so the patterns below apply.
    for _num in cn_number_pattern.findall(service_time):
        if not re.search("[十拾百佰千仟]", _num):
            # Plain digit sequence (e.g. "二〇二三"): map char by char.
            num = "".join(str(DigitsDic.get(word, word)) for word in _num)
        else:
            num = str(get_num(_num))
        service_time = service_time.replace(_num, num, 1)

    # Fall back to 2000-01-01 when page_time is empty/None or holds no date.
    re_page_time = re.search(r"20\d{2}-\d{2}-\d{2}", page_time or "")
    page_time = re_page_time.group() if re_page_time else "2000-01-01"

    if pattern1.search(service_time):
        time_list = []
        for _time in pattern1.findall(service_time):
            _time = re.sub("[年月./]", "-", _time.replace("日", ""))
            _year, _month, _day = (int(part) for part in _time.split("-"))
            if _year>2050 or _year<=2000 or _month>12 or _month<=0 or _day<=0 or _day>31:
                continue
            if isValidDate(_year, _month, _day):
                time_list.append(format_date(_year, _month, _day))
        apply_dates(time_list)
    elif pattern3.search(service_time):
        time_list = []
        for _time in pattern3.findall(service_time):
            _time = re.sub("[年./]", "-", _time.replace("月", ""))
            _year, _month = (int(part) for part in _time.split("-"))
            if _year>2050 or _year<=2000 or _month>12 or _month<=0:
                continue
            # Only year and month are given: use the last day of that month.
            _day = get_month_days(_year, _month)
            if isValidDate(_year, _month, _day):
                time_list.append(format_date(_year, _month, _day))
        apply_dates(time_list)
    elif pattern2.search(service_time) or pattern4.search(service_time):
        for pattern in [pattern2, pattern4]:
            match = pattern.findall(service_time)
            # Only trust the text when it states a single, unambiguous duration.
            if len(set(match)) == 1:
                service_days = duration_days(match[0])
                if service_days > 3:
                    serviceTime_dict['service_days'] = service_days
                    # NOTE(review): stops at the first pattern yielding a usable
                    # duration, otherwise the next pattern is tried - confirm
                    # this matches the intended control flow.
                    break
    elif "半年" in service_time:
        serviceTime_dict['service_days'] = 180

    # With both dates known, the duration is derived from them.
    if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
        serviceTime_dict['service_days'] = get_days_between(serviceTime_dict['service_start'],
                                                            serviceTime_dict['service_end'])
    return serviceTime_dict
def getServiceTime():
    """Placeholder with no behaviour; always returns ``None``."""
    return None
def getOtherAttributes(list_entity,page_time,prem,channel_dic):
    """Collect document-level attributes that are not tied to a single role.

    Args:
        list_entity: entities of one document. Each entity is read through
            ``entity_type`` and ``entity_text`` and, depending on the type,
            ``prob``, ``in_attachment``, ``label``, ``notes``, ``money_unit``,
            ``sentence_index`` and ``begin_index``.
        page_time: value forwarded unchanged to ``my_timeFormat`` and
            ``extract_serviceTime``.
        prem: sequence whose first element is a dict that may hold
            ``time_contractStart`` / ``time_contractEnd``. For a contract
            announcement without a contract end date, both keys are filled
            in place from the extracted service period.
        channel_dic: dict read as ``channel_dic['docchannel']['docchannel']``.

    Returns:
        dict with the keys ``moneysource``, ``person_review``, ``serviceTime``
        (a dict holding ``service_start`` / ``service_end`` /
        ``service_days``), ``product``, ``total_tendereeMoney``,
        ``total_tendereeMoneyUnit`` and, when a bid-way entity was seen,
        ``bidway``.
    """
    dict_other = {"moneysource":"",
                  "person_review":[],
                  "serviceTime":"",
                  "product":[],
                  "total_tendereeMoney":0,
                  "total_tendereeMoneyUnit":''}
    list_serviceTime = []
    last_moneysource_prob = 0
    for entity in list_entity:
        if entity.entity_type == 'bidway':
            dict_other["bidway"] = turnBidWay(entity.entity_text)
        elif entity.entity_type=='moneysource':
            # Once a money source is known, attachment mentions never replace it.
            if dict_other["moneysource"] and entity.in_attachment:
                continue
            if not dict_other["moneysource"]:
                dict_other["moneysource"] = entity.entity_text
                last_moneysource_prob = entity.prob
            elif entity.prob>last_moneysource_prob:
                dict_other["moneysource"] = entity.entity_text
                last_moneysource_prob = entity.prob
        elif entity.entity_type=='serviceTime':
            # Keep only mentions that contain a time unit or a yyyy-mm style date.
            if re.search(r"[^之]日|天|年|月|周|星期", entity.entity_text) or re.search(r"\d{4}[-./]\d{1,2}", entity.entity_text):
                list_serviceTime.append(entity)
        elif entity.entity_type=="person" and entity.label ==4 and entity.entity_text not in dict_other["person_review"]:  # de-duplicate review experts
            dict_other["person_review"].append(entity.entity_text)
        elif entity.entity_type=='product' and entity.entity_text not in dict_other["product"]:  # ordered de-duplication
            dict_other["product"].append(entity.entity_text)
        elif entity.entity_type=='money' and entity.notes=='总投资' and float(dict_other["total_tendereeMoney"])<float(entity.entity_text):
            # Keep the largest "total investment" amount.
            dict_other["total_tendereeMoney"] = str(Decimal(entity.entity_text))
            dict_other["total_tendereeMoneyUnit"] = entity.money_unit

    # Contract dates already present on the first prem entry take priority.
    time_contractEnd = prem[0].get("time_contractEnd","")[:10]
    time_contractStart = prem[0].get("time_contractStart","")[:10]
    serviceTime_dict = {"service_start":"", "service_end":"", "service_days": 0}
    if time_contractEnd:
        serviceTime_dict['service_end'] = time_contractEnd
        if time_contractStart:
            if get_days_between(time_contractStart,time_contractEnd)>0:
                serviceTime_dict['service_start'] = time_contractStart

    if list_serviceTime and not serviceTime_dict['service_end']:
        list_serviceTime_inAtt = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==1]
        list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
        error_serviceTime = []
        # Main-text mentions are tried before attachment mentions.
        for list_time in [list_serviceTime,list_serviceTime_inAtt]:
            if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
                list_time.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
                for _serviceTime in list_time:
                    # First choice: an explicit range "20XX-x-x ... 20XX-x-x".
                    if re.search(r"20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
                        _extract_time = my_timeFormat(_serviceTime.entity_text,page_time)
                        if _extract_time and len(_extract_time)==2:
                            # Identical start and end dates are treated as an
                            # unfilled template and remembered as unusable.
                            if _extract_time[0]!=_extract_time[1]:
                                serviceTime_dict['service_start'] = _extract_time[0]
                                serviceTime_dict['service_end'] = _extract_time[1]
                                break
                            else:
                                error_serviceTime.append(_serviceTime.entity_text)
            if not serviceTime_dict['service_end']:
                for _serviceTime in list_time:
                    # Second choice: a month-level range "20XX-x ... 20XX-x".
                    if re.search(r"20\d{2}[年/.\-]\d{1,2}月?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,3}20\d{2}[年/.\-]\d{1,2}月?", _serviceTime.entity_text):
                        extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
                        if extract_time['service_end']:
                            serviceTime_dict = extract_time
                            break
            if not serviceTime_dict['service_end']:
                for _serviceTime in list_time:
                    # Third choice: a single explicit date "20XX-x-x".
                    if re.search(r"20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
                        if _serviceTime.entity_text not in error_serviceTime:
                            extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
                            if extract_time['service_end']:
                                serviceTime_dict = extract_time
                                break
            if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
                # Last resort: any remaining mention that yields an end date
                # or a number of days.
                for _serviceTime in list_time:
                    if _serviceTime.entity_text not in error_serviceTime:
                        extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
                        if extract_time['service_end'] or extract_time['service_days']:
                            serviceTime_dict = extract_time
                            break

    if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
        service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
        serviceTime_dict['service_days'] = service_days
    dict_other["serviceTime"] = serviceTime_dict

    # Contract announcements without a contract end date borrow the service period.
    if not time_contractEnd and channel_dic['docchannel']['docchannel']=='合同公告':
        if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
            prem[0]["time_contractStart"] = serviceTime_dict['service_start']
            prem[0]["time_contractEnd"] = serviceTime_dict['service_end']
    if dict_other['moneysource']:
        dict_other['moneysource'] = turnMoneySource(dict_other['moneysource'])
    return dict_other
def getMoneyRange(RoleList):
    """Placeholder with no implementation; ``RoleList`` is ignored and None is returned."""
    pass
def getProjectContacts(list_entity, list_sentence):
    """Extract "project contact" persons together with their phone numbers.

    A person entity (label 1, 2 or 3) becomes a candidate when one of the
    contact keywords occurs in the 15 characters to its left. Candidates
    found in an attachment get half the keyword score. Candidates are then
    visited from the highest score down; attachment candidates are dropped
    as soon as a main-text candidate with a phone number has been seen, and
    each phone number is reported only once.

    Returns:
        ``{'project_contacts': [[person_text, phone_text], ...]}``
    """
    keyword_scores = (
        ('项目.?联系[人方]', 0.9),
        ('项目.?联系.?方式', 0.85),
        ('项目.?负责人', 0.8),
    )
    persons = [candidate for candidate in list_entity
               if candidate.entity_type == 'person' and candidate.label in [1, 2, 3]]
    persons.sort(key=lambda p: (p.sentence_index, p.wordOffset_begin))

    scored = []
    for person in persons:
        start = person.wordOffset_begin
        if start < 5:  # not enough room on the left for a keyword such as '项目联系人'
            continue
        sentence_text = list_sentence[person.sentence_index].sentence_text
        left_context = sentence_text[max(0, start - 15):start]
        for keyword, score in keyword_scores:
            if re.search(keyword, left_context):
                weight = score / 2 if person.in_attachment else score
                scored.append((weight, person.sentence_index, start, person))
    # Highest score first; ties go to the earliest sentence and offset.
    scored.sort(key=lambda c: (c[0], -c[1], -c[2]), reverse=True)

    contacts = []
    seen_phones = set()
    found_in_main_text = False
    for _, _, _, person in scored:
        phone = person.person_phone[0].entity_text if person.person_phone else ""
        if not phone:
            continue
        if not person.in_attachment:
            found_in_main_text = True
        elif found_in_main_text:
            # The main text already produced a contact, so attachment hits are ignored.
            break
        if phone not in seen_phones:
            seen_phones.add(phone)
            contacts.append([person.entity_text, phone])
    return {'project_contacts': contacts}
def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
    """Build one result dict per document.

    Args:
        list_sentences: per-document sentence lists.
        list_entitys: per-document entity lists.
        list_articles: per-document article objects (``doc_id``,
            ``fingerprint``, ``match_enterprise``, ``match_enterprise_type``,
            ``attachmentTypes`` and ``bidway`` are read).
        list_outlines: per-document outline lists.
        page_time: value forwarded to ``getTimeAttributes``.

    Returns:
        list of dicts combining the package/role/money structure (``prem``),
        the document id, the time attributes, the project contacts and the
        article metadata.
    """
    result = []
    documents = zip(list_sentences, list_entitys, list_articles, list_outlines)
    for sentences, entities, article, outline in documents:
        role_list = getPackageRoleMoney(sentences, entities, outline)
        base = {"prem": role_list, "docid": article.doc_id}
        time_attributes = getTimeAttributes(entities, sentences, page_time)
        contact_attributes = getProjectContacts(entities, sentences)
        article_attributes = {
            "fingerprint": article.fingerprint,
            "match_enterprise": article.match_enterprise,
            "match_enterprise_type": article.match_enterprise_type,
            "process_time": getCurrent_date(),
            "attachmentTypes": article.attachmentTypes,
            "bidway": article.bidway,
        }
        result.append(dict(base, **time_attributes, **contact_attributes, **article_attributes))
    return result
def correct_rolemoney(prem, total_product_money, list_articles):
    '''
    Correct the winning amount from table totals (modifies ``prem`` in place).

    Only applies when the document has exactly one winner/runner-up role and
    the text contains no explicit "winning/deal/contract/bid amount:" phrase.
    Then either
      * the sum of unit price x quantity extracted from tables replaces a
        winning amount that is 0, or that is below half of the tenderee
        amount while the table sum lies between the two, or
      * when no usable table sum exists and the text contains exactly one
        "合计" or "总价", that single total replaces a winning amount that is
        0 or is below a tenth of it.

    :param prem: list whose first element holds the ``prem`` package dict
    :param total_product_money: amount summed from the product table
    :param list_articles: article objects; ``list_articles[0].content`` is read
    :return: None
    '''
    full_text = list_articles[0].content
    if '##attachment##' in full_text:
        # Split on the first marker only: several attachments produce several
        # markers, which would break a plain two-value unpacking.
        content, attachment = full_text.split('##attachment##', 1)
        if len(content) < 200:
            content += attachment
    else:
        content = full_text

    role_mentions = re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem']))
    # The closing bracket and the colon are literal text and are accepted in
    # both ASCII and full-width form.
    explicit_amount = re.search(r'(中标|成交|合同|投标)[)）]?(总?金额|[报总]?价)[:：]', content)
    if len(role_mentions) == 1 and explicit_amount is None:
        if total_product_money>0 and total_product_money<5000000000:
            for value in prem[0]['prem'].values():
                ree_money = float(value['tendereeMoney'])
                for l in value['roleList']:
                    try:
                        # Replace when the amount is 0, or below half of the
                        # tenderee amount with the table sum in between.
                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or (float(l["role_money"]['money'])<ree_money/2 and float(l["role_money"]['money'])<total_product_money<ree_money)):
                            l["role_money"]['money'] = total_product_money
                    except Exception as e:
                        print('表格产品价格修正中标价格报错:%s'%e)
        elif (len(re.findall('合计', content)) == 1 or len(re.findall('总价', content)) == 1):
            # The header may carry a unit, optionally bracketed, e.g. "合计(万元):".
            keyword = '合计' if len(re.findall('合计', content)) == 1 else '总价'
            ser = re.search(r'(?P<header>' + keyword + r'([(（]?万?元[)）]?)?[:：])(?P<money>[\d,.]+(万?元)?)', content)
            if ser:
                money_text = ser.group('money')
                header = ser.group('header')
                money, money_unit = money_process(money_text, header)
                if 100<money<8000000:
                    for value in prem[0]['prem'].values():
                        for l in value['roleList']:
                            try:
                                # Replace when the amount is 0, or below a tenth
                                # of the total with no winning-amount keyword in
                                # the text.
                                if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money'])==0 or (float(l["role_money"]['money']) < money / 10 and re.search('(中标|成交|合同)(总?金额|[单报总]?价)', content) is None)):
                                    l["role_money"]['money'] = str(money)
                                    l["role_money"]['money_unit'] = money_unit
                            except Exception as e:
                                print('修正中标价格报错:%s' % e)
def limit_maximum_amount(dic, list_entity):
    '''
    Clamp implausible role and tenderee amounts (modifies ``dic`` in place).

    An upper and a lower plausibility bound are chosen from keywords in the
    title / project name / products, from the industry and from the
    announcement type. Amounts above the upper bound that look like a
    "万元" value multiplied by 10000 once too often are divided by 10000;
    tiny "元" amounts formatted like a "万元" figure are multiplied by 10000.

    :param dic: final result dict; ``doctitle_refine``, ``name``, ``product``,
                ``docchannel``, ``industry`` and ``prem`` are read
    :param list_entity: entity list; money entities between 5000 and
                5000000 are used to detect an exact x10000 relationship
    :return: None
    '''
    indu_amount = {
        '计算机设备': 200000000,
        '办公设备': 100000000,
        '家具用具': 500000000,
        '办公消耗用品及类似物品': 100000000,
        '日杂用品': 100000000,
        '餐饮业': 1000000000,
        '物业管理': 1000000000,
        '工程技术与设计服务': 1000000000,
        '工程评价服务': 100000000,
        '其他工程服务': 100000000,
        '工程监理服务': 100000000,
        '工程造价服务': 100000000,
        '会计、审计及税务服务': 100000000,
        '其他专业咨询与调查': 100000000
    }
    title = dic.get('doctitle_refine', '')
    name = dic.get('name', '')
    product = ','.join(dic.get('product', []))
    text = "%s;%s;%s"%(title, name, product)
    doctype = dic.get('docchannel', {}).get('doctype', '')  # announcement type
    industry = dic['industry'].get('class_name', '')
    category = dic['industry'].get('class', '')  # industry category
    moneys = [float(it.entity_text) for it in list_entity if it.entity_type=='money' and re.search(r'^\d+(\.\d+)?', it.entity_text) and 5000<float(it.entity_text)<5000000]
    maximum_amount = 10000000000
    minimum_amount = 100
    if re.search('监理|造价咨询|设计|勘察|招标代理中介服务|工程审计', text) and re.search('施工|总承包|ppp|PPP', text.replace('施工监理', '监理')) is None:
        # supervision / design / consulting services
        maximum_amount = 1000000000
        minimum_amount = 200
    elif re.search('施工|总承包|ppp|PPP|公路|道路|桥梁|铁路|土地使用权|地块|棚改|征地拆迁|棚户区改造|土地征收|建设用地|社会保险', text) or category in ['金融业', '建筑业'] or doctype == '土地矿产':
        # construction, railways, land and similar large projects
        if industry in ['科研、医疗、教育用房', '住宅、商业用房', '场馆、站港用房','工业、生产用房','专业施工']:
            maximum_amount = 20000000000
            minimum_amount = 200
        elif industry in ['修缮工程', '电气安装', '管道和设备安装', '建筑装饰和装修业', '建筑物拆除和场地准备活动']:
            maximum_amount = 10000000000
            minimum_amount = 100
        else:
            maximum_amount = 50000000000
            minimum_amount = 500
    elif re.search('(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text):
        # goods purchasing
        maximum_amount = 80000000
        minimum_amount = 10
    elif re.search('修理|维修|(安保|保安|安全|保洁|物业|后勤|管理|代理|中介|印刷)服务', text):
        # repair and facility services
        maximum_amount = 50000000
    elif re.search('(速递|快递|邮政|邮寄)(物流)?服务', text):
        # courier services
        maximum_amount = 80000000
        minimum_amount = 10
    elif industry in indu_amount:
        maximum_amount = indu_amount[industry]

    for value in dic['prem'].values():
        for l in value['roleList']:
            if l["role_name"] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
                serviceTime_dict = l.get('serviceTime', dict())
                serviceTime_dict = serviceTime_dict if serviceTime_dict else dict()
                date = serviceTime_dict.get("service_days",0)
                if 0 < date < 180 and float(l["role_money"]['money']) > 10000000000:
                    # More than ten billion for a period under 180 days is
                    # treated as an error.
                    l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
                elif float(l["role_money"]['money']) > maximum_amount:
                    flag = 1
                    for money in moneys:
                        # An exact x10000 relation to another amount in the
                        # text confirms the unit was applied twice.
                        if float(l["role_money"]['money'])/money == 10000 and l['role_money']['money_unit'] == '万元':
                            l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
                            flag = 0
                            break
                    if (flag and l["role_money"]['money_unit'] == '万元') or re.search(r'^\d{11,}(\.0)?$', str(l["role_money"]['money'])):
                        l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
                    elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]:
                        l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
                elif 0<float(l["role_money"]['money']) < minimum_amount:
                    if l["role_money"]['money_unit'] == '元' and re.search(r'^\d{1,2}\.\d{4,6}$', str(l["role_money"]['money'])):
                        # A small "元" amount written like a "万元" figure is
                        # scaled up. Amounts below the minimum are otherwise
                        # kept, so genuinely small amounts are not lost.
                        l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) * 10000)
        if float(value['tendereeMoney']) > maximum_amount:
            flag = 1
            for money in moneys:
                # Use the tenderee amount's own unit here; the role loop
                # variable is undefined when the package has no roles and
                # unrelated to this amount otherwise.
                if float(value['tendereeMoney'])/money == 10000 and value['tendereeMoneyUnit'] == '万元':
                    value['tendereeMoney'] = str(Decimal(value['tendereeMoney'])/10000)
                    flag = 0
                    break
            # Only divide when the amount exceeds the limit 100-fold, to avoid
            # shrinking legitimately large tenderee amounts.
            if ((flag and value['tendereeMoneyUnit'] == '万元') or re.search(r'^\d{11,}(\.0)?$', str(value['tendereeMoney']))) and float(value['tendereeMoney']) > maximum_amount*100:
                value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000)
            elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]:
                value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000)
        elif 0<float(value['tendereeMoney']) < minimum_amount:
            if value['tendereeMoneyUnit'] == '元' and re.search(r'^\d{1,2}\.\d{4,6}$', str(value['tendereeMoney'])):
                # Small "元" amount written like a "万元" figure: scale up.
                value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) * 10000)
def limit_maximum_amount_backup(prem, industry):
    """Divide over-limit amounts by 10000 using per-industry ceilings only.

    Nothing happens when the industry has no configured ceiling. Otherwise a
    winning amount or tenderee amount above the ceiling is divided by 10000
    when the industry is catering / property management, or when the
    amount's unit is '万元'. Role amounts are written back as strings,
    tenderee amounts as floats. ``prem`` is modified in place.
    """
    indu = industry['industry'].get('class_name', '')
    ceilings = {
        '计算机设备': 200000000,
        '办公设备': 100000000,
        '家具用具': 500000000,
        '办公消耗用品及类似物品': 100000000,
        '日杂用品': 100000000,
        '餐饮业': 1000000000,
        '物业管理': 1000000000,
        '工程技术与设计服务': 1000000000,
        '工程评价服务': 100000000,
        '其他工程服务': 100000000,
        '工程监理服务': 100000000,
        '工程造价服务': 100000000,
        '会计、审计及税务服务': 100000000,
    }
    if indu not in ceilings:
        return
    ceiling = ceilings[indu]
    always_scaled = indu in ['餐饮业', '物业管理']
    try:
        for package in prem[0]['prem'].values():
            for role in package['roleList']:
                role_money = role["role_money"]
                over_limit = role["role_name"] == 'win_tenderer' and float(role_money['money']) > ceiling
                if over_limit and (always_scaled or role_money['money_unit'] == '万元'):
                    role_money['money'] = str(float(role_money['money'])/10000)
            if float(package['tendereeMoney']) > ceiling:
                if always_scaled or package['tendereeMoneyUnit'] == '万元':
                    package['tendereeMoney'] = float(package['tendereeMoney']) / 10000
    except Exception as e:
        print('行业分类限制最高金额抛出异常:%s' % e)
def get_win_joint(prem, list_entitys, list_sentences, list_articles):
    '''
    Detect joint-venture (consortium) partners of the winner and store them
    on the winner role as ``win_tenderer_joint`` (modifies ``prem`` in place).

    The joined value lists the winner first, followed by each partner in the
    order found, comma separated and without duplicates.

    :param prem: list whose first element holds the ``prem`` package dict
    :param list_entitys: per-document entity lists
    :param list_sentences: per-document sentence lists; only the first is used
    :param list_articles: article objects; ``list_articles[0].content`` is read
    :return: None; any exception is caught and printed
    '''
    # Brackets, colons, commas and semicolons in these patterns are literal
    # text and are accepted in both ASCII and full-width form. Writing the
    # bracket as a bare "(" makes it a regex group; an unclosed one stops the
    # pattern from compiling.
    content_pattern = (
        r'联合(体|方|投标人)[:：]|联合体(成员|单位)[12345一二三四五]?[:：]'
        r'|(联合体)?成员单位[12345一二三四五]?[:：]|特殊普通合伙[:：]'
        r'|[(（]联合(体|投标人)[)）]|[(（]联合体(成员|单位)方?[12345一二三四五]?[)）]'
        r'|[(（](联合体)?成员单位[12345一二三四五]?[)）]|[(（]特殊普通合伙|成员?[)）]'
        r'|[,，;；]成[:：]|[(（]成[)）,，]|与[^,，。]{6,100}联合体'
    )
    # Text allowed between the previous organisation and the next one.
    between_pattern = (
        r'联合(体|方|投标人)[:：]|联合体(成员|单位)[12345一二三四五]?[:：]'
        r'|(联合体)?成员单位[12345一二三四五]?[:：]|特殊普通合伙[:：]'
        r'|[,，;；]成[:：]|[(（]?成[)）]?$'
    )
    # Text directly after the next organisation that marks it as a partner.
    after_pattern = (
        r'[(（]联合(体|方|投标人)[)）]|[(（]联合体(成员|单位)方?[12345一二三四五]?[)）]'
        r'|[(（](联合体)?成员单位[12345一二三四五]?[)）]|[(（]特殊普通合伙|成员?[)）]'
        r'|^[(（]成[)）,，]$'
    )
    with_pattern = r'与[^,，。]{6,100}联合体'
    list_separators = ['与', ';', '；', '、', '&', ',', '，', '/', '//']
    list_terminators = [';', '；', '、', '&', ',', '，', '/', '//', '。', ')', '）']
    try:
        content = list_articles[0].content
        if 'win_tenderer' in str(prem[0]['prem']) and re.search(content_pattern, content):
            sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
            for v in prem[0]['prem'].values():
                for d in v['roleList']:
                    if d.get('role_name', '') == 'win_tenderer':
                        winner = d.get('role_text')
                        join_l = [winner]
                        for list_entity in list_entitys:
                            for i in range(len(list_entity)-1):
                                _entity = list_entity[i]
                                e = _entity.wordOffset_end
                                if _entity.entity_type in ['org', 'company'] and _entity.label==2\
                                        and _entity.entity_text==winner:
                                    s = sentences[_entity.sentence_index].sentence_text
                                    find_joint = 0  # whether a joint-venture marker was seen
                                    # Walk the following entities while they
                                    # keep chaining onto the winner.
                                    for j in range(i+1, len(list_entity)):
                                        behind_entity = list_entity[j]
                                        b2 = behind_entity.wordOffset_begin
                                        e2 = behind_entity.wordOffset_end
                                        if (_entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company']
                                                and b2-e<13 and re.search(between_pattern, s[e:b2])) or \
                                                (re.search(after_pattern, s[e2:e2+10]) and behind_entity.label in [2, 5]):
                                            join_l.append(behind_entity.entity_text)
                                            e = e2
                                            find_joint = 1
                                        elif (find_joint or re.search(with_pattern, content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in list_separators and (len(s)==e2 or s[e2] in list_terminators or s[e2:e2+3]=='联合体'):
                                            join_l.append(behind_entity.entity_text)
                                            e = e2
                                        elif e == e2:  # a duplicated entity must not end the chain
                                            continue
                                        else:
                                            break
                                    if len(join_l)>1:
                                        # Ordered de-duplication keeps the
                                        # winner first and the output stable.
                                        d['win_tenderer_joint'] = ','.join(dict.fromkeys(join_l))
    except Exception as err:
        print('获取联合体抛出异常', err)
def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, all_winner=False):
    '''
    Find multiple winners and collect every money amount of the document.

    When several winners are found they are written into ``prem`` as the
    comma-joined ``multi_winner`` value of the matching ``win_tenderer``
    role. Amounts from the main text and from attachments are returned
    separately.

    :param channel_dic: dict read as ``channel_dic['docchannel']['life_docchannel']``
    :param prem: list whose first element holds the ``prem`` package dict (modified in place)
    :param list_entitys: per-document entity lists; only ``list_entitys[0]`` is used
    :param list_sentences: per-document sentence lists; only ``list_sentences[0]`` is used
    :param all_winner: when truthy, winner entities are accepted regardless of
        their score and ranked / shortlisted organisations are collected too
    :return: ``{'moneys': [...], 'moneys_attachment': [...]}`` with de-duplicated floats
    '''
    def add_multi_winner(pack_l, winner_l):
        # pack_l / winner_l hold (text, sentence_index, wordOffset_begin, in_attachment) tuples.
        # Reads ``tenderee_or_agency`` from the enclosing function's scope.
        if len(prem[0]['prem']) > 1 and len(set([it[0] for it in pack_l])) > 1: # several packages with several winners
            pk_dic = {}
            for ent in winner_l:
                for i in range(len(pack_l)):
                    pk, s1, b1, _ = pack_l[i]
                    # The winner is located before this package mention: stop scanning.
                    if ent[1] < s1 or ent[1] == s1 and ent[2] < b1:
                        break
                    elif (ent[1] > s1 or ent[1] == s1 and ent[2] > b1):
                        if i < len(pack_l) - 1:
                            pk2, s2, b2, _ = pack_l[i + 1]
                            # The winner lies between this package mention and the next one.
                            if (ent[1] < s2 or ent[1] == s2 and ent[2] < b2):
                                if pk not in pk_dic:
                                    pk_dic[pk] = set()
                                pk_dic[pk].add(ent[0])
                            else:
                                continue
                        else:
                            # Last package mention: everything after it belongs to it.
                            if pk not in pk_dic:
                                pk_dic[pk] = set()
                            pk_dic[pk].add(ent[0])
                    else:
                        continue
            for pk, multi_winner in pk_dic.items():
                multi_winner = multi_winner - tenderee_or_agency
                if len(multi_winner) < 2:
                    continue
                for k, v in prem[0]['prem'].items():
                    if pk == k:
                        for d in v['roleList']:
                            if d.get('role_name', '') == 'win_tenderer':
                                if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
                                    d['multi_winner'] = ','.join(set(multi_winner))
        elif 0 < len(prem[0]['prem']) < 3: # single package with several winners, e.g. 285780273
            multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
            if len(multi_winner) > 1:
                for v in prem[0]['prem'].values():
                    for d in v['roleList']:
                        if d.get('role_name', '') == 'win_tenderer':
                            if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
                                d['multi_winner'] = ','.join(set(multi_winner))
                                break
    moneys = []
    moneys_attachment = []
    if channel_dic['docchannel']['life_docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
        finalists = [] # shortlisted suppliers
        multi_winner_l = [] # winner names found in the text
        tenderee_or_agency = set()
        package_l = []
        i = 0
        # NOTE(review): the loop bound stops one short of the list length, so the
        # final entity is never examined here — confirm this is intended.
        while i < len(list_entitys[0])-1:
            ent = list_entitys[0][i]
            b_idx_fr = ent.wordOffset_begin
            e_idx_fr = ent.wordOffset_end
            i += 1
            if ent.entity_type in ['money']:
                money = float(ent.entity_text)
                if ent.in_attachment:
                    moneys_attachment.append(money)
                else:
                    moneys.append(money)
            elif ent.entity_type in ['package']:
                package_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
            elif ent.entity_type in ['org', 'company']:
                sentence_text = sentences[ent.sentence_index].sentence_text
                pre_text = sentence_text[max(0, b_idx_fr - 10):b_idx_fr]
                if ent.label in [0,1] and ent.values[ent.label] > 0.8:
                    tenderee_or_agency.add(ent.entity_text)
                elif ent.label == 2 and (ent.values[ent.label] > 0.8 or all_winner):
                    multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
                    # Absorb organisations that directly follow the winner in the same sentence.
                    for j in range(i, len(list_entitys[0])):
                        ent_bh = list_entitys[0][j]
                        b_idx_bh = ent_bh.wordOffset_begin
                        e_idx_bh = ent_bh.wordOffset_end
                        if ent_bh.entity_type in ['org','company'] and ent_bh.label in [2,5] and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
                            if sentence_text[e_idx_fr:b_idx_bh] in [';', '、', '&', ',', '/', '//'] and (
                                    len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in [';', '、', '&', ',','/', '//','。']): # length check avoids an index error when the list ends the text, e.g. 407126558
                                multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
                                e_idx_fr = e_idx_bh
                                i = j + 1
                            else:
                                break
                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr: # two entities with no separator between them
                            multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
                            e_idx_fr = e_idx_bh
                            i = j + 1
                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and e_idx_fr == e_idx_bh: # same entity matched twice by the dictionary, e.g. 514603520
                            i = j + 1
                        else:
                            break
                if re.search('入围', pre_text) and re.search('未入围', pre_text)==None:
                    finalists.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
                elif all_winner==1 and ent.label in [3,4,5] and re.search('第[一二三四五六七八九十0-9]+名|候选(人|单位)|入围(单位|供应商)|投标银行', pre_text) and re.search('未', pre_text)==None:
                    multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
        if len(multi_winner_l)>=2:
            # Main-text winners are preferred; attachment winners are the fallback.
            winner_main = [it for it in multi_winner_l if not it[3]]
            winner_attn = [it for it in multi_winner_l if it[3]]
            pack_main = [it for it in package_l if not it[3]]
            pack_attn = [it for it in package_l if it[3]]
            if len(set([it[0] for it in winner_main]))>=2: # two or more winners, possibly several packages, e.g. 441612746
                add_multi_winner(pack_main, winner_main)
            elif len(set([it[0] for it in winner_attn]))>=2:
                add_multi_winner(pack_attn, winner_attn)
        if len(finalists)>=2: # several shortlisted candidates
            winner_main = [it for it in finalists if not it[3]]
            winner_attn = [it for it in finalists if it[3]]
            pack_main = [it for it in package_l if not it[3]]
            pack_attn = [it for it in package_l if it[3]]
            if len(set([it[0] for it in winner_main]))>=2: # two or more shortlisted names, e.g. 276326152
                add_multi_winner(pack_main, winner_main)
            elif len(set([it[0] for it in winner_attn]))>=2:
                add_multi_winner(pack_attn, winner_attn)
    else:
        # Other announcement types: only collect the amounts.
        for i in range(len(list_entitys[0])):
            ent = list_entitys[0][i]
            if ent.entity_type in ['money']:
                money = float(ent.entity_text)
                if ent.in_attachment:
                    moneys_attachment.append(money)
                else:
                    moneys.append(money)
    return {'moneys': list(set(moneys)), 'moneys_attachment': list(set(moneys_attachment))}
def update_prem(old_prem, new_prem, in_attachment=False):
    '''
    Merge the packages extracted from tables into the existing packages.

    ``old_prem`` is updated in place; ``new_prem`` may be re-keyed in place.

    :param old_prem: package dict from the non-table extraction
    :param new_prem: package dict extracted from tables
    :param in_attachment: True when the table came from an attachment
    :return: 0 when an attachment table is ignored because ``old_prem``
             already has a (pre-)winner, otherwise None
    '''
    if len(new_prem) >= 1 :
        # With two or more table packages, the table wins: drop old packages
        # the table does not know when the old set is larger (up to twice the
        # size) or shares no key with the table.
        if len(new_prem) >= 2 and (len(new_prem)<len(old_prem) <= len(new_prem)*2 or set(old_prem)&set(new_prem)==set()):
            del_k = []
            for k in old_prem:
                if k not in new_prem and k != 'Project':
                    del_k.append(k)
            for k in del_k:
                old_prem.pop(k)
        # The table found fewer packages than the text: drop text packages
        # that came from an attachment and are unknown to the table.
        if len(old_prem) > len(new_prem) and len(new_prem)>1 and in_attachment==False:
            del_k = []
            for k in old_prem:
                if 'in_attachment' in old_prem[k] and old_prem[k]['in_attachment'] and k not in new_prem and k != 'Project':
                    del_k.append(k)
            for k in del_k:
                old_prem.pop(k)
        if in_attachment:  # an attachment table never overrides an existing winner
            for v in old_prem.values():
                for d in v['roleList']:
                    if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
                        return 0
        if len(old_prem) == 2 and len(new_prem) == 1 and ('Project' in new_prem or set(new_prem)&set(old_prem)==set()):
            # One table package against two text packages: merge the table
            # package into the non-Project text package.
            k = list(old_prem.keys()-set(['Project']))[0]
            k_new = list(new_prem.keys())[0]
            new_prem[k] = new_prem.pop(k_new)
        elif len(old_prem) == 1 and len(new_prem) == 1 and 'Project' not in old_prem and set(new_prem)&set(old_prem)==set():
            # One package on each side under different names: use the text name.
            k = list(old_prem.keys()-set(['Project']))[0]
            k_new = list(new_prem.keys())[0]
            new_prem[k] = new_prem.pop(k_new)
        if len(new_prem) == len(old_prem) == 1 and 'Project' not in new_prem and 'Project' in old_prem:
            # The table has a package number the text lacks: also merge it
            # into Project.
            k = list(new_prem.keys())[0]
            new_prem['Project'] = new_prem[k]
        multi_tendereeMoney = []  # tenderee amounts of the individual packages
        for k, v in new_prem.items():
            if k == 'Project':
                if 'Project' in old_prem:
                    tmp_l = []  # table roles that also exist in the old package
                    if v.get('code', "") != "":
                        old_prem['Project']['code'] = v.get('code', "")
                    if v.get('name', "") != "":
                        old_prem['Project']['name'] = v.get('name', "")
                    for d in old_prem['Project']['roleList']:
                        for d2 in v['roleList']:
                            if d['role_name'] == d2['role_name']:  # shared roles take the table values
                                tmp_l.append(d2)
                                if d2['role_text'] != "":
                                    d['role_text'] = d2['role_text']
                                if d2['serviceTime'] != "":
                                    d['serviceTime'] = d2['serviceTime']
                                if float(d2['role_money']['money']) != 0:  # only a non-zero table amount replaces
                                    d['role_money']['money'] = d2['role_money']['money']
                                    d['role_money']['money_unit'] = d2['role_money']['money_unit']
                                # Copy attributes only the table role has,
                                # e.g. multi_winner. A separate name keeps
                                # the package key intact.
                                for k2 in set(d2)-set(d):
                                    if d2[k2]:
                                        d[k2] = d2[k2]
                    for d2 in v['roleList']:
                        if d2 not in tmp_l:  # roles only the table has are appended
                            old_prem['Project']['roleList'].append(d2)
                    if float(new_prem['Project']['tendereeMoney'])!=0:
                        old_prem['Project']['tendereeMoney'] = new_prem['Project']['tendereeMoney']
                else:
                    old_prem[k] = v
            else:
                # Compare numerically: the amount may be the string '0',
                # which is not equal to the integer 0.
                if float(v['tendereeMoney']) != 0:
                    multi_tendereeMoney.append(v['tendereeMoney'])
                if k.startswith('自增'):  # table rows without a package number are keyed '自增<row>'
                    k = k[2:]
                if k not in old_prem:  # packages only the table has are added as they are
                    old_prem[k] = v
                else:
                    tmp_l = []  # table roles that also exist in the old package
                    if v.get('code', "") != "":
                        old_prem[k]['code'] = v.get('code', "")
                    if v.get('name', "") != "":
                        old_prem[k]['name'] = v.get('name', "")
                    for d in old_prem[k]['roleList']:
                        for d2 in v['roleList']:
                            if d['role_name'] == d2['role_name']:
                                tmp_l.append(d2)
                                if d2['role_text'] != "":
                                    d['role_text'] = d2['role_text']
                                if d2['serviceTime'] != "":
                                    d['serviceTime'] = d2['serviceTime']
                                if float(d2['role_money']['money']) != 0:  # only a non-zero table amount replaces
                                    d['role_money']['money'] = d2['role_money']['money']
                                    d['role_money']['money_unit'] = d2['role_money']['money_unit']
                                for k2 in set(d2)-set(d):  # copy attributes only the table role has
                                    if d2[k2]:
                                        d[k2] = d2[k2]
                    for d2 in v['roleList']:
                        if d2 not in tmp_l:  # roles only the table has are appended
                            old_prem[k]['roleList'].append(d2)
                    if float(v['tendereeMoney']) != 0:
                        old_prem[k]['tendereeMoney'] = v['tendereeMoney']  # prefer the table's tenderee amount
        # Per-package tenderee amounts from the table make the Project-level
        # amount redundant.
        if multi_tendereeMoney and 'Project' in old_prem and float(old_prem['Project']['tendereeMoney'])!=0:
            old_prem['Project']['tendereeMoney'] = 0
def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0, name=""):
    """Apply rule-based sanity corrections to the package dict, in place.

    * Every package gets a fresh ``uuid``.
    * With several packages: the default ``Project`` package loses its
      winner / second / third roles when one of its winners also appears in
      another package, or when another package's winner has the higher
      probability. With exactly two packages and a more probable Project
      winner, the other package is removed instead.
    * For award-type announcements, packages without any role are removed
      once some other package names a winner.
    * Deposit projects with a single package and no tenderee amount take
      the total investment as tenderee amount.
    * With at most two packages, packages without a name take the project
      name.

    :param prem: package dict, keyed by package name
    :param channel_dic: dict read as ``channel_dic['docchannel']['docchannel']``
    :param is_deposit_project: whether this is a deposit-type project
    :param total_tendereeMoney: total investment amount
    :param name: project name
    :return: None
    """
    ranked_roles = ['win_tenderer', 'pre_win_tenderer', 'second_tenderer', 'third_tenderer']
    if len(prem) > 1:
        project_names = set()
        package_names = set()
        project_prob = 0
        package_prob = 0
        packages_without_roles = []
        for pkg_key in prem:
            package = prem[pkg_key]
            package['uuid'] = str(uuid.uuid4())
            if package['roleList'] == []:
                packages_without_roles.append(pkg_key)
            for role in package['roleList']:
                if role['role_name'] not in ranked_roles:
                    continue
                # Joint-venture partners and co-winners count as winners too.
                names = {role['role_text']}
                for extra_key in ('win_tenderer_joint', 'multi_winner'):
                    if extra_key in role:
                        names.update(set(role[extra_key].split(',')))
                confident_winner = role['role_name'] == 'win_tenderer' and role.get('role_prob', 0) > 0.6
                if pkg_key == 'Project':
                    project_names.update(names)
                    if confident_winner:
                        project_prob = role.get('role_prob', 0)
                else:
                    package_names.update(names)
                    if confident_winner:
                        package_prob = role.get('role_prob', 0)

        shared_names = project_names & package_names
        if project_names != set() and (shared_names != set() or package_prob > project_prob):
            kept_roles = []
            for role in prem['Project']['roleList']:
                if role['role_name'] not in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
                    kept_roles.append(role)
            prem['Project']['roleList'] = kept_roles
        elif package_prob < project_prob and len(prem) == 2:
            for pkg_key in [key for key in prem if key != 'Project']:
                prem.pop(pkg_key)

        if package_names and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
            for pkg_key in packages_without_roles:
                prem.pop(pkg_key)
    elif "Project" in prem:
        prem['Project']['uuid'] = str(uuid.uuid4())

    if is_deposit_project and float(total_tendereeMoney) != 0 and len(prem) == 1:
        for package in prem.values():
            if float(package['tendereeMoney']) == 0:
                package['tendereeMoney'] = total_tendereeMoney

    if name != '' and len(prem) <= 2:
        for package in prem.values():
            if package.get('name', '') == '':
                package['name'] = name
def fix_single_source(prem, channel_dic, original_docchannel):
    """Downgrade winners to pre-winners for single-source tender notices.

    Applies only when the bid way is '单一来源', the predicted channel is
    '招标公告' and the original channel code is 52. In that case every
    ``win_tenderer`` role in ``prem['prem']`` is renamed to
    ``pre_win_tenderer`` in place.
    """
    if prem.get('bidway', '') != '单一来源':
        return
    is_tender_notice = channel_dic['docchannel']['docchannel'] == '招标公告'
    if not (is_tender_notice and original_docchannel == 52):
        return
    for package in prem['prem'].values():
        for role in package['roleList']:
            if role['role_name'] == "win_tenderer":
                role['role_name'] = 'pre_win_tenderer'
if __name__=="__main__":
    # Script entry point. The body consists only of two adjacent string
    # literals holding disabled code (a database export followed by an HTML
    # report writer), so running this module directly does nothing.
    '''
    conn = getConnection()
    cursor = conn.cursor()
    #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
    sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "

    result = []

    cursor.execute(sql)
    rows = cursor.fetchall()
    count = 0
    for row in rows:

    count += 1
    # print(count)
    doc_id = row[0]

    roleList = getPackageRoleMoney(doc_id)
    result.append([doc_id,str(roleList),row[1]])
    ''''''
    with codecs.open("getAttribute.html","w",encoding="utf8") as f:
    f.write('<html><head>\
    <meta http-equiv="Content-Type"\
    content="text/html; charset=UTF-8">\
    </head>\
    <body bgcolor="#FFFFFF">\
    <table border="1">\
    <tr>\
    <td>doc_id</td>\
    <td>角色</td>\
    </tr>')
    for item in result:
    f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
    f.write("</table></body>")
    '''
|