pb_extract.py 78 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680
  1. import copy
  2. import os
  3. import re
  4. import sys
  5. import traceback
  6. from decimal import Decimal
  7. import pandas as pd
  8. from bs4 import BeautifulSoup
  9. sys.path.append(os.path.abspath(os.path.dirname(__file__)) + '/../../../')
  10. from BiddingKG.dl.common.Utils import spanWindow, timeFormat
  11. class PBPredictor:
  12. def __init__(self):
  13. self.stage_pattern, self.stage_priority_dict = get_stage_pattern()
  14. self.industry_pattern = get_industry_pattern()
  15. self.property_pattern, self.property_priority_dict = get_property_pattern()
  16. with open(os.path.abspath(os.path.dirname(__file__)) + '/structure_keyword.txt', 'r', encoding='utf-8') as f:
  17. self.structure_keyword_list = f.readlines()
  18. def get_col_from_prem(self, prem):
  19. tenderee, agency, product = None, None, None
  20. for item in prem:
  21. prem = item.get('prem')
  22. for key in prem.keys():
  23. project = prem.get(key)
  24. role_list = project.get('roleList')
  25. for role_dict in role_list:
  26. if tenderee is None and role_dict.get('role_name') == 'tenderee':
  27. tenderee = role_dict.get('role_text')
  28. if agency is None and role_dict.get('role_name') == 'agency':
  29. agency = role_dict.get('role_text')
  30. product = item.get('product')
  31. begin_time = item.get('time_commencement')
  32. end_time = item.get('time_completion')
  33. return tenderee, agency, product, begin_time, end_time
  34. def predict(self, prem, list_articles, list_sentences, list_entitys, doctitle, code_name_dict, dochtmlcon):
  35. try:
  36. for list_article, list_sentence, list_entity in zip(list_articles, list_sentences, list_entitys):
  37. list_sentence.sort(key=lambda x: x.sentence_index)
  38. tenderee, agency, product, begin_time, end_time = self.get_col_from_prem(prem)
  39. content = list_article.content
  40. dochtmlcon = re.sub('[\r\n]', '', dochtmlcon)
  41. # 获取无附件的html
  42. soup = BeautifulSoup(dochtmlcon, 'lxml')
  43. attachment_div = soup.find('div', class_='richTextFetch')
  44. if attachment_div:
  45. attachment_div.decompose()
  46. content_no_att = soup.text
  47. else:
  48. content_no_att = ''
  49. project_name = code_name_dict.get('name')
  50. project_code = code_name_dict.get('code')
  51. if project_code:
  52. project_code = project_code[0]
  53. else:
  54. project_code = None
  55. stage = extract_legal_stage(project_name+doctitle, self.stage_pattern, self.stage_priority_dict, product, tenderee=tenderee, agency=agency)
  56. industry1 = extract_industry(doctitle+content, self.industry_pattern)
  57. industry = extract_industry(doctitle+content_no_att, self.industry_pattern)
  58. # print('industry', industry, industry1)
  59. if not industry and industry1:
  60. industry = industry1
  61. proportion1, proportion = extract_proportion(content)
  62. project_digest = extract_project_digest(content)
  63. project_address = extract_project_address(list_sentence, list_entity)
  64. location = get_bid_location(doctitle+"\t"+project_name)
  65. project_name_refind, show_name_refind = get_project_name_refind(project_name, doctitle, tenderee, agency)
  66. has_elevator = extract_has_elevator(content)
  67. project_property = extract_project_property(doctitle+"\t"+project_name, self.property_pattern, self.property_priority_dict)
  68. total_invest, construct_install_fee, engineer_cost = extract_several_money(list_sentence, dochtmlcon)
  69. max_floor = extract_max_floor(content, dochtmlcon)
  70. structure = extract_structure(content, dochtmlcon, self.structure_keyword_list)
  71. has_steel = extract_has_steel_structure(structure)
  72. wall_type, wall_type2 = extract_wall_type(doctitle+"\t"+project_name, content)
  73. if stage is not None:
  74. has_stage = 1
  75. else:
  76. has_stage = 0
  77. pb_json = {
  78. 'tenderee': tenderee,
  79. 'agency': agency,
  80. 'project_code': project_code,
  81. 'project_name': project_name,
  82. 'doctitle': doctitle,
  83. 'stage': stage,
  84. 'industry': industry,
  85. 'proportion': proportion,
  86. 'projectDigest': project_digest,
  87. 'projectAddress': project_address,
  88. 'location': location,
  89. 'project_name_refind': project_name_refind,
  90. 'has_elevator': has_elevator,
  91. 'project_property': project_property,
  92. 'total_invest': total_invest,
  93. 'construct_install_fee': construct_install_fee,
  94. 'engineer_cost': engineer_cost,
  95. 'max_floor': max_floor,
  96. 'structure': structure,
  97. 'has_steel': has_steel,
  98. 'wall_type': wall_type,
  99. 'wall_type2': wall_type2,
  100. 'begin_time': begin_time,
  101. 'end_time': end_time,
  102. 'has_stage': has_stage,
  103. }
  104. pb_json = {'pb': pb_json}
  105. return pb_json
  106. except:
  107. traceback.print_exc()
  108. return {'pb': 'error'}
  109. def extract_legal_stage(content, _pattern, priority_dict, product='', tenderee='', agency=''):
  110. # 判断这几类直接返回
  111. if not content:
  112. return None
  113. if re.search("拍卖|转让|产权|出让|租赁|招租", content) is not None:
  114. return None
  115. # 去掉招标人代理人
  116. content = re.sub(str(tenderee), '', content)
  117. content = re.sub(str(agency), '', content)
  118. # 竣工阶段标志:消防、物业、安保、装修、通风系统、排烟、第三方检测
  119. # if re.search("消防|物业|安保|装修|通风系统|排烟|第三方检测", content) is not None:
  120. # return '竣工阶段'
  121. # 替换混淆词
  122. _content = re.sub("设计院|设计总院|造价咨询有限公司", "", content)
  123. list_stage = []
  124. for stage_search in re.finditer(_pattern, _content):
  125. for k,v in stage_search.groupdict().items():
  126. if v is not None:
  127. list_stage.append([k, priority_dict.get(k)])
  128. if len(list_stage) > 0:
  129. list_stage.sort(key=lambda x: x[1])
  130. stage = list_stage[0][0]
  131. # 用product判断竣工阶段是否合法
  132. if product:
  133. if not re.search('施工|工程|建设', str(product)):
  134. stage = None
  135. for s in list_stage:
  136. if s[0] != '竣工阶段':
  137. stage = s[0]
  138. break
  139. # 立项排除立项目
  140. if stage == '立项阶段':
  141. sub_content = re.sub('立项目', '', _content)
  142. for stage_search in re.finditer(_pattern, sub_content):
  143. for k,v in stage_search.groupdict().items():
  144. if v is not None:
  145. list_stage.append([k, priority_dict.get(k)])
  146. if len(list_stage)>0:
  147. list_stage.sort(key=lambda x: x[1])
  148. stage = list_stage[0][0]
  149. return stage
  150. return None
  151. def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min_len=3):
  152. # 跳过部分
  153. re_str11 = '网上超市|服务市场采购|印刷服务|复印纸|车辆维修和保养|商品房预售|办公家具定点|直接订购|定点议价' \
  154. '|政府采购意向|信息技术服务定点议价|信息技术服务定点采购|法人章刻制中介机构|专用设备|办公设备采购' \
  155. '|线上摇号选取'
  156. re_str12 = '物业'
  157. re_str13 = '公共资源交易平台'
  158. re_str19 = '环境影响评价(文件|)(审批|审核|受理)|拟作出的建设'
  159. # 干扰部分
  160. re_str1 = "<.*?>[.*?]|{.*?}|〔.*?〕|《.*?》|【.*?】|\(.*?\)|\[.*?\]|(.*?)|\d{1,2}月\d{1,2}[日号]|\d{1,2}:\d{2}"
  161. re_str4 = '[,.:;,。:;\'\"“”‘’\-/<>#@!$%&*+=·¥|??|-+#"﹝﹒!]'
  162. re_str5 = '[工程项目建设拟对年批第作出的个及在已]|标段|EPC|总承包|招标|文件|开标|记录|公示|验收|勘察|编制|公开选取|准予|论证|各单位|附件|建筑业' \
  163. '|责任|诚挚|拟作出审批意见|生产|涉及|消防|政府|投资|方案|技术支持|文件|研发|申请报告|出具|现代|产业|依法|报批|行政|审批|许可|开展' \
  164. '|活动|开放日|系列|某部|零星工程|某(地产|型号|单位)|权限内|办理了|外商|我院|召开|我市|启动|我单位|我(县|区|会)|成功|举办|举行' \
  165. '|被评为|征(求|询)|包括|不包括|层层分包|合同估算价|万元以内|组织|全(市|区|县)|承接|积极|针对|企业|小规模|安全|助推|装修|改造' \
  166. '|新建|居住|技术|建设|建筑|安装|园林|绿化|信息化|采购|商品房|预售|许可|房产|测量|报告|业务|零星|维修|水土保持|扩建|夜间|工地' \
  167. '|整治|高速公路|备案|加油站|设施|环境|保护|合同|履约|在线|询价|面积|受理|经济中心|服务|食品|加工|利用|公开|选取|动物|疫苗|框架' \
  168. '|协议|房屋|中国|不动产|实验室|限额|以下|单位|入围|审查|合格|意见|新能源|常规|许可|申请|加工|制品|建议书|可研|结算|审核|遴选' \
  169. ''
  170. re_str6 = '总承包|设计|环评|监理|施工|竣工|项目|工程|EPC|验收|勘察设计|全过程造价咨询|造价咨询|勘察|可行性研究报告|初步设计|社会稳定风险评估|测绘' \
  171. '|(地震安全性|环境影响|水土保持)评(价估)'
  172. re_str7 = '许可信息公开表|办理结果公示|审批信息公开表|验收公示表|信息披露|备案登记表|验收结果公示|' \
  173. '审批意见公开|受理公示|施工许可|情况说明|合同纠纷调解|施工许可双公示|施工许可证|政策'
  174. re_str8 = '[〔〕()\[\]《》()【】{}{}[]<>]'
  175. re_str14 = '[〔(\[(【{{[<](采购结果|设计|环评|监理|施工|竣工|工程|EPC|验收|勘察设计|全过程造价咨询|造价咨询|勘察)[〕)\]》)】}}]>]'
  176. # 截取部分
  177. re_str2 = '机场快线|科技园|产业园|工业园|工程|项目|施工|竣工|总承包|改造|监理|可研|验收|勘察设计|全过程造价咨询|造价咨询|勘察|可行性研究报告|EPC|初步设计|社会稳定风险评估'
  178. re_str3 = '关于(公布核准的|一种用于|核发|作出|同意|开展|调整|请求|规范|要求|进一步|遴选|领取)|关于公[布开示]|关于[为对]|关于|公司|集团|局|委托'
  179. re_str9 = '改扩建|建设|迁改|土建|测绘|(地震安全性|环境影响|水土保持)评(价估)'
  180. # 混淆部分
  181. re_str10 = '局部'
  182. re_str17 = '(工程|信息|)有限|公司|集团|局|大学|院|学校|学院|中心'
  183. re_str18 = '(设计|造价|咨询|建设|项目|管理|工程)+有限|(信息|职业|技术|管理)+(大学|学校|学院|中心|院|所)'
  184. re_str26 = '服务类|(设计|造价|咨询|建设|项目|管理|工程)+(大学|学校|学院|中心|院|所|集团|局)'
  185. # 需判断删除部分
  186. re_str15 = '[ 、#※.?<|①=-_—]|=-|##|\*+|[\((](001|[0-9]|[一二三四五六七八九十]+|)[\))]|(0001|0000|001|002|01|02)+'
  187. re_str16 = '[0-9][.、]'
  188. # 删除特定表达
  189. re_str20 = '公共资源交易中心.*关于'
  190. re_str21 = '[\u4e00-\u9fff]{2,}市[\u4e00-\u9fff]{2,}区'
  191. re_str22 = '[\u4e00-\u9fff]{2,4}区[^至]'
  192. re_str23 = '.{1,2}招标公告|(PDF|pdf)(版|)'
  193. re_str25 = '(小区)$'
  194. re_str27 = '[\u4e00-\u9fff]{2,3}省|[\u4e00-\u9fff]{2,3}市'
  195. re_str_area = '华北|华南|华东|华中|西南|东北|西北'
  196. re_str_province = '北京|天津|河北|山西|内蒙古|广东|海南|广西|上海|江苏|浙江|安徽|福建|江西|山东|河南|湖北|湖南|重庆|四川|贵州|云南|西藏|黑龙江|辽宁|吉林|陕西|甘肃|青海|宁夏|新疆|台湾|香港|澳门'
  197. re_str_city = '东城|和平|石家庄|唐山|秦皇岛|邯郸|邢台|保定|张家口|承德|沧州|廊坊|衡水|太原|大同|阳泉|长治' \
  198. '|晋城|朔州|晋中|运城|忻州|临汾|吕梁|呼和浩特|包头|乌海|赤峰|通辽|鄂尔多斯|呼伦贝尔|巴彦淖尔' \
  199. '|乌兰察布|兴安盟|锡林郭勒盟|阿拉善盟|广州|韶关|深圳|珠海|汕头|佛山|江门|湛江|茂名|肇庆|惠州' \
  200. '|梅州|汕尾|河源|阳江|清远|潮州|揭阳|云浮|海口|三亚|南宁|柳州|桂林|梧州|北海|防城港|钦州|贵港' \
  201. '|玉林|百色|贺州|河池|来宾|崇左|黄浦|南京|无锡|徐州|常州|苏州|南通|连云港|淮安|盐城|扬州|镇江' \
  202. '|泰州|宿迁|杭州|宁波|温州|嘉兴|湖州|绍兴|金华|衢州|舟山|台州|丽水|合肥|芜湖|蚌埠|淮南|马鞍山' \
  203. '|淮北|铜陵|安庆|黄山|滁州|阜阳|宿州|六安|亳州|池州|宣城|福州|厦门|莆田|三明|泉州|漳州|南平' \
  204. '|龙岩|宁德|南昌|景德镇|萍乡|九江|新余|鹰潭|赣州|吉安|宜春|抚州|上饶|济南|青岛|淄博|枣庄' \
  205. '|东营|烟台|潍坊|济宁|泰安|威海|日照|临沂|德州|聊城|滨州|菏泽|郑州|开封|洛阳|平顶山|安阳|鹤壁' \
  206. '|新乡|焦作|濮阳|许昌|漯河|三门峡|南阳|商丘|信阳|周口|驻马店|武汉|黄石|十堰|宜昌|襄阳|鄂州' \
  207. '|荆门|孝感|荆州|黄冈|咸宁|随州|恩施土家族|长沙|株洲|湘潭|衡阳|邵阳|岳阳|常德|张家界|益阳' \
  208. '|郴州|永州|怀化|娄底|湘西土家族|万州|成都|自贡|攀枝花|泸州|德阳|绵阳|广元|遂宁|内江|乐山' \
  209. '|南充|眉山|宜宾|广安|达州|雅安|巴中|资阳|阿坝藏族羌族|甘孜藏族|凉山彝族|贵阳|六盘水|遵义' \
  210. '|安顺|铜仁|黔西南布依族|毕节|黔东南苗族|黔南布依族|昆明|曲靖|玉溪|保山|昭通|丽江|普洱|临沧' \
  211. '|楚雄彝族|红河哈尼族|文山|西双版纳傣族|大理白族|德宏傣族景颇族|怒江傈僳族|迪庆藏族|拉萨|昌都' \
  212. '|山南|日喀则|那曲|阿里地区|林芝|哈尔滨|齐齐哈尔|鸡西|鹤岗|双鸭山|大庆|伊春|佳木斯|七台河' \
  213. '|牡丹江|黑河|绥化|大兴安岭|沈阳|大连|鞍山|抚顺|本溪|丹东|锦州|营口|阜新|辽阳|盘锦|铁岭' \
  214. '|朝阳|葫芦岛|长春|吉林|四平|辽源|通化|白山|松原|白城|延边朝鲜族|西安|铜川|宝鸡|咸阳|渭南' \
  215. '|延安|汉中|榆林|安康|商洛|兰州|嘉峪关|金昌|白银|天水|武威|张掖|平凉|酒泉|庆阳|定西|陇南' \
  216. '|临夏回族自治州|甘南藏族|西宁|海东|海北藏族|黄南藏族|海南藏族|果洛藏族|玉树藏族|海西蒙古族' \
  217. '|银川|石嘴山|吴忠|固原|中卫|乌鲁木齐|克拉玛依|吐鲁番|哈密|昌吉|博尔塔拉蒙古|巴音郭楞蒙古' \
  218. '|阿克苏|克孜勒苏柯尔克孜|喀什|和田地区|伊犁|伊犁哈萨克|塔城地区|阿勒泰|中山|东莞|天门|仙桃|潜江' \
  219. '|石河子|五家渠|阿拉尔|图木舒克|三沙|儋州|涪陵|永川|西城|朝阳|丰台|石景山|海淀|门头沟' \
  220. '|房山|通州|顺义|昌平|大兴|怀柔|平谷|密云|延庆|河东|河西|河北区|红桥|东丽|西青|津南|北辰' \
  221. '|武清|宝坻|滨海|宁河|静海|蓟州|渝中|大渡口|江北|沙坪坝|九龙坡|南岸|北碚|綦江|大足|渝北' \
  222. '|巴南|黔江|长寿|江津|合川|南川|璧山|铜梁|潼南|荣昌|开州|徐汇|长宁|静安|普陀|虹口|杨浦' \
  223. '|闵行|宝山|嘉定|浦东新|金山|松江|青浦|奉贤|崇明|济源|神农架林区|五指山|文昌|琼海|万宁' \
  224. '|东方|定安|屯昌|澄迈|临高|白沙黎族|昌江黎族|乐东黎族|陵水黎族|保亭黎族|琼中黎族|梁平' \
  225. '|丰都|城口|垫江|忠县|云阳|奉节|巫山|巫溪|石柱|秀山|武隆|酉阳|彭水|南开|北屯|铁门关' \
  226. '|双河|可克达拉|昆玉|胡杨河'
  227. re_str28 = '({})(地区)?|({})省?|({})[区市]?'.format(re_str_area, re_str_province, re_str_city)
  228. re_str29 = '(({})(地区)?({})省?)|(({})省?({})[区市]?)'.format(re_str_area, re_str_province, re_str_province, re_str_city)
  229. # 直接删除部分
  230. re_str24 = '(的|)(竞争性谈判|竞争性磋商|磋商|中标|单一来源|招标|更正)(采购|)(公告|)'
  231. add_col = project_name if project_name else '' + doctitle if doctitle else ''
  232. if re.search(re_str11, add_col) and not re.search(re_str12, add_col):
  233. return '', ''
  234. from_col_list = [project_name, doctitle]
  235. name_refind_flag_dict = {'True': [], 'False': []}
  236. for col in from_col_list:
  237. name_refind = ""
  238. match_flag = False
  239. if col is not None and len(col) > 0:
  240. name_refind = col
  241. # 部分跳过
  242. if re.search(re_str13, name_refind):
  243. continue
  244. # 替换特定表达
  245. match = re.search(re_str20, name_refind)
  246. if match:
  247. name_refind = name_refind[match.span()[1]:]
  248. # 去掉干扰
  249. name_refind = re.sub('年度', '年', name_refind)
  250. name_refind = re.sub(re_str4, '', name_refind)
  251. name_refind = re.sub(re_str14, '', name_refind)
  252. # print('name_refind', name_refind)
  253. # 连续截取工程前的,看哪一部分最适合当refind
  254. match = re.finditer(re_str2, name_refind)
  255. prob_name_list = []
  256. last_index = 0
  257. project_word_in_org = []
  258. for m in match:
  259. # 混淆词,设施工程中的施工
  260. if m.span()[0] > 0 and name_refind[m.span()[0]-1] in ['设']:
  261. continue
  262. # 判断是不是公司名里的工程
  263. if re.search(re_str26, name_refind[m.span()[1]:]):
  264. project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
  265. continue
  266. if re.search(re_str17, name_refind[m.span()[1]:m.span()[1]+3]):
  267. project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
  268. continue
  269. if re.search(re_str18, name_refind[m.span()[1]:]):
  270. project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
  271. continue
  272. match_flag = True
  273. prob_name_list.append(name_refind[last_index:m.span()[1]])
  274. last_index = m.span()[1]
  275. # print('match_flag', match_flag, name_refind)
  276. # 找不到则用第二套截取
  277. if not prob_name_list:
  278. match = re.finditer(re_str9, name_refind)
  279. last_index = 0
  280. for m in match:
  281. # 混淆词,设施工程中的施工
  282. if m.span()[0] > 0 and name_refind[m.span()[0]-1] in ['设']:
  283. continue
  284. # 判断是不是公司名里的工程
  285. if re.search(re_str26, name_refind[m.span()[1]:]):
  286. project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
  287. continue
  288. if re.search(re_str17, name_refind[m.span()[1]:m.span()[1]+3]):
  289. project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
  290. continue
  291. if re.search(re_str18, name_refind[m.span()[1]:]):
  292. project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
  293. continue
  294. match_flag = True
  295. prob_name_list.append(name_refind[last_index:m.span()[1]])
  296. last_index = m.span()[1]
  297. if not prob_name_list:
  298. prob_name_list = [name_refind]
  299. # print('prob_name_list', prob_name_list)
  300. # print('project_word_in_org', project_word_in_org)
  301. # 一开始不去掉括号里的内容,截取后再去掉
  302. for i, name in enumerate(prob_name_list):
  303. # 括号内容大于一半字数,则不去掉括号中的字
  304. match = re.search(re_str1, name)
  305. # print('name', name)
  306. # print('match', match)
  307. if match and len(match.group()) < len(name) / 2:
  308. name = re.sub(re_str1, "", name)
  309. name = re.sub(re_str8, "", name)
  310. prob_name_list[i] = name
  311. # 判断refind是否合法
  312. # print('prob_name_list2', prob_name_list)
  313. name_refind = ''
  314. for name in prob_name_list:
  315. # 截取公司后的
  316. match = re.finditer(re_str3, name)
  317. prob_name_list2 = []
  318. for m in match:
  319. # 排除混淆的情况
  320. if m.group() in re_str10 and re.search(re_str10, name):
  321. continue
  322. prob_name_list2.append(name[m.span()[1]:])
  323. if prob_name_list2:
  324. name = prob_name_list2[-1]
  325. # 剔除工程类判断词
  326. match1 = re.finditer(re_str6, name)
  327. for m1 in match1:
  328. # 混淆词,设施工程中的施工
  329. if m1.span()[0] > 0 and name[m1.span()[0]-1] in ['设']:
  330. continue
  331. s_index, e_index = m1.span()
  332. word = name[s_index:e_index]
  333. s_index = s_index - 1 if s_index > 0 else 0
  334. e_index = e_index + 1 if e_index < len(name) else len(name)
  335. word1 = name[s_index:e_index]
  336. if word1 in project_word_in_org:
  337. continue
  338. name = re.sub(re.escape(word), '=' * len(word), name)
  339. name = re.sub('={2,}', "", name)
  340. # 剔除一些无关词占用长度
  341. if len(re.findall('[\u4e00-\u9fff]', name)) >= min_len \
  342. and len(re.findall('[\u4e00-\u9fff]', re.sub(re_str5, '', name))) >= min_len:
  343. name_refind = name
  344. break
  345. if match_flag:
  346. name_refind_flag_dict['True'] += [name_refind]
  347. else:
  348. name_refind_flag_dict['False'] += [name_refind]
  349. # print('name_refind_flag_dict', name_refind_flag_dict)
  350. true_list = name_refind_flag_dict.get('True')
  351. false_list = name_refind_flag_dict.get('False')
  352. name_refind_candidate_list = []
  353. if true_list:
  354. true_list.sort(key=lambda x: len(x), reverse=True)
  355. name_refind = true_list[0]
  356. name_refind_candidate_list += true_list
  357. # else:
  358. # name_refind = ''
  359. if false_list:
  360. false_list.sort(key=lambda x: len(x), reverse=True)
  361. name_refind_candidate_list += false_list
  362. # 对候选name_refind循环
  363. name_refind = ''
  364. show_name_refind = ''
  365. for name_refind in name_refind_candidate_list:
  366. # 直接判断删除数字
  367. match = re.match(re_str16, name_refind)
  368. if match and not re.match('[0-9]', name_refind[match.span()[1]:match.span()[1]+1]):
  369. name_refind = name_refind[match.span()[1]:]
  370. # 删除开头奇怪数字
  371. match = re.match(re_str15, name_refind)
  372. if match and not re.match('[a-zA-Z地块号]', name_refind[match.span()[1]:match.span()[1]+1]):
  373. name_refind = name_refind[match.span()[1]:]
  374. # 删除期数
  375. name_refind = re.sub('[1-9一二三四五六七八九十]期', '', name_refind)
  376. # 跳过'xx省xx市'
  377. if re.search(re_str21, name_refind):
  378. sub_word = re.sub(re_str21, '', name_refind)
  379. sub_word = re.sub(re_str2 + '|' + re_str9, '', sub_word)
  380. if len(sub_word) <= 1:
  381. name_refind = ''
  382. continue
  383. match27 = re.search(re_str27, name_refind)
  384. if match27 and len(match27.group()) == len(name_refind):
  385. name_refind = ''
  386. continue
  387. match28 = re.search(re_str28, name_refind)
  388. if match28 and len(match28.group()) == len(name_refind):
  389. name_refind = ''
  390. continue
  391. match29 = re.search(re_str29, name_refind)
  392. if match29 and len(match29.group()) == len(name_refind):
  393. name_refind = ''
  394. continue
  395. # 删除类似'招标公告'表达
  396. match2 = re.match(re_str23, name_refind)
  397. if match2:
  398. name_refind = name_refind[match2.span()[1]:]
  399. name_refind = re.sub(re_str24, '', name_refind)
  400. # 跳过文件审批
  401. if re.search(re_str19, name_refind):
  402. name_refind = ''
  403. continue
  404. # 跳过网上超市
  405. if re.search(re_str11, name_refind):
  406. name_refind = ''
  407. continue
  408. show_name_refind = copy.deepcopy(name_refind)
  409. # 删除区
  410. match2 = re.match(re_str22, name_refind)
  411. if match2:
  412. name_refind = name_refind[match2.span()[1]-1:]
  413. # 删除'小区表达'
  414. if len(name_refind) >= min_len + 2:
  415. name_refind = re.sub(re_str25, '', name_refind)
  416. # 判断name_refind是否是从公司中来的,过滤
  417. if tenderee in [None, 'None', '-', '']:
  418. tenderee = ''
  419. if agency in [None, 'None', '-', '']:
  420. agency = ''
  421. try:
  422. if len(name_refind) >= 4 and (re.search(re.escape(name_refind[-4:]), tenderee) or re.search(re.escape(name_refind[-4:]), agency)):
  423. name_refind = ''
  424. show_name_refind = ''
  425. except:
  426. pass
  427. # 判断长度
  428. if len(name_refind) < min_len:
  429. name_refind = ''
  430. show_name_refind = ''
  431. continue
  432. break
  433. return name_refind, show_name_refind
  434. def extract_industry(content, _pattern):
  435. list_stage = []
  436. stage_dict = {}
  437. for stage_search in re.finditer(_pattern, content):
  438. for k,v in stage_search.groupdict().items():
  439. if v is not None:
  440. list_stage.append(k)
  441. if k in stage_dict.keys():
  442. stage_dict[k] += 1
  443. else:
  444. stage_dict[k] = 1
  445. if len(list_stage)>0:
  446. stage_cnt_list = [[x, stage_dict.get(x)] for x in stage_dict.keys()]
  447. stage_cnt_list.sort(key=lambda x: x[1], reverse=True)
  448. # print('extract_industry ' + str(stage_cnt_list))
  449. return stage_cnt_list[0][0]
  450. # return list_stage[0]
  451. return None
  452. def extract_project_code_name(list_entity):
  453. project_code = None
  454. project_name = None
  455. for p_entity in list_entity:
  456. if p_entity.entity_type == "name":
  457. project_name = p_entity.entity_text
  458. elif p_entity.entity_type == "code":
  459. project_code = p_entity.entity_text
  460. if project_name and project_code:
  461. break
  462. return project_code, project_name
  463. def extract_tenderee(list_entity):
  464. tenderee = None
  465. for p_entity in list_entity:
  466. if str(p_entity.label) == "0":
  467. tenderee = p_entity.entity_text
  468. break
  469. return tenderee
  470. def extract_project_digest(content):
  471. _pattern = "(?P<projectDigest>(项目|工程|标的|需求|建设|招标|采购|内容)(概况|规模|简介|信息|范围|内容|说明|摘要).{10,300})"
  472. _pattern_search = re.search(_pattern,content)
  473. _projectDigest = ""
  474. _find = ""
  475. if _pattern_search is not None:
  476. _find = _pattern_search.groupdict().get("projectDigest","")
  477. if len(_find)>0:
  478. _projectDigest = "。".join(_find.split("。")[0:3])
  479. # 截掉中标信息
  480. if len(_projectDigest) >= 10:
  481. _projectDigest = cut_win_bid_part(_projectDigest)
  482. if len(_projectDigest) < 10:
  483. _projectDigest = ""
  484. return _projectDigest
  485. def extract_project_address(list_sentence, list_entity):
  486. reg1 = "(项目|建设|工程)所在区域(位于|为|)[::]?"
  487. reg2 = "(项目|建设|工程)(地址|地点|)(位于|起于)[::]?"
  488. reg3 = "(项目|建设|工程)(地址|地点)[::]?(位于|起于)"
  489. reg4 = "(项目|建设|工程)(地址|地点)[为::]+"
  490. address_list = []
  491. candidate_list = []
  492. for sentence in list_sentence:
  493. for reg in [reg1, reg2, reg3, reg4]:
  494. content = sentence.sentence_text
  495. match = re.finditer(reg, content)
  496. for m in match:
  497. end_index = m.span()[1]
  498. for p_entity in list_entity:
  499. if p_entity.sentence_index != sentence.sentence_index:
  500. continue
  501. if p_entity.entity_type != "location" and p_entity.entity_type != "org":
  502. continue
  503. text = p_entity.entity_text
  504. if text == content[end_index:end_index+len(text)] or text in content[end_index:end_index+len(text)+10]:
  505. address_list.append(text)
  506. else:
  507. candidate_list.append(content[max(0, end_index-10):end_index] + '@@@' + content[end_index:end_index+20] + '@@@' + text)
  508. if address_list:
  509. break
  510. if not address_list:
  511. # for can in candidate_list:
  512. # logging.info('candidate ' + can)
  513. return None
  514. else:
  515. address_list.sort(key=lambda x: len(x), reverse=True)
  516. # for address in address_list:
  517. # logging.info('address ' + address)
  518. address = address_list[0]
  519. return address_list[0]
  520. def extract_begin_end_time(list_sentence, list_entity):
  521. _begin_time = None
  522. _end_time = None
  523. for p_entity in list_entity:
  524. if p_entity.entity_type == "time":
  525. for _sentence in list_sentence:
  526. if _sentence.sentence_index == p_entity.sentence_index:
  527. _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
  528. if re.search("开工(时间|日期)",_span[0]) is not None:
  529. _time_temp = timeFormat(p_entity.entity_text)
  530. if len(_time_temp)>0:
  531. _begin_time = _time_temp
  532. if re.search("(竣工|完工)(时间|日期)",_span[0]) is not None:
  533. _time_temp = timeFormat(p_entity.entity_text)
  534. if len(_time_temp)>0:
  535. _end_time = _time_temp
  536. return _begin_time,_end_time
  537. def get_bid_location(content):
  538. """
  539. 获取标段工程地点
  540. """
  541. import BiddingKG.dl.interface.Preprocessing as Preprocessing
  542. content = re.sub("[,,.。、\-\(\)()— #+~“”'﹙{}-]", "", content)
  543. # 预处理
  544. content_html = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"><html><body><div>' + content + "</div></body></html>"
  545. list_articles, list_sentences, list_entitys, list_outlines, _cost_time \
  546. = Preprocessing.get_preprocessed([[0, content_html, "", "", content, "", ""]],
  547. useselffool=True)
  548. # 获取location
  549. location = None
  550. for entitys in list_entitys:
  551. for entity in entitys:
  552. if entity.entity_type == "location":
  553. if re.search("[街路村县区溪湖河城厂]|[小中大]学|学[校院]", entity.entity_text):
  554. location = entity.entity_text
  555. break
  556. # location还未空,寻找类似 '薛家湾(张家圪旦)至柳青'的表达
  557. if not location:
  558. match = re.search('.{2,4}至.{2,4}', content)
  559. if match:
  560. location = match.group()
  561. # 判断location不在一些特定实体里
  562. find_flag = False
  563. for entitys in list_entitys:
  564. for entity in entitys:
  565. if entity.entity_type in ["tenderee", 'agency', 'win_tenderer', 'second_tenderer', 'third_tenderer', 'company', 'org']:
  566. if location in entity.entity_text:
  567. find_flag = True
  568. break
  569. if find_flag:
  570. break
  571. if find_flag:
  572. location = None
  573. return location
  574. def extract_proportion(content, has_preffix=True):
  575. if not content:
  576. return "", ""
  577. # log("content")
  578. # log(content)
  579. suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
  580. reg_dict = {
  581. 0: "(?P<proportion>(总((建筑|建设)面积|长|长度))" + suffix,
  582. 1: "(?P<proportion>((建筑|建设)面积|全长)" + suffix,
  583. 2: "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)" + suffix
  584. }
  585. if not has_preffix:
  586. reg_dict[3] = "(?P<proportion>" + suffix
  587. _proportion = ""
  588. for i in range(len(list(reg_dict.keys()))):
  589. if _proportion:
  590. break
  591. _pattern = reg_dict.get(i)
  592. # logging.info('content ' + str(content))
  593. match = re.search(_pattern, str(content))
  594. if match:
  595. _proportion = match.groupdict().get("proportion","")
  596. if not _proportion:
  597. return "", ""
  598. # 统一格式
  599. multiple_cnt = 1
  600. digit = ""
  601. # 确定具体数字
  602. match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
  603. if match:
  604. # logging.info(str(_proportion) + ' ' + str(match.group()))
  605. d1 = match.group('d1')
  606. d2 = match.group('d2')
  607. try:
  608. d1 = int(re.sub(',', '', d1))
  609. except:
  610. return "", ""
  611. if d2:
  612. d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
  613. # print('d1, d2', d1, d2)
  614. d1 += d2
  615. digit = d1
  616. # print('digit', digit)
  617. # 确定中文倍数
  618. _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
  619. match = re.search('[十百千万亿]+', _proportion2)
  620. _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
  621. if match:
  622. for c in match.group():
  623. multiple_cnt *= _dict.get(c)
  624. _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
  625. else:
  626. _proportion3 = _proportion2
  627. # print('multiple_cnt2', multiple_cnt)
  628. # 确定面积/长度
  629. match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
  630. if match:
  631. unit = '㎡'
  632. else:
  633. unit = 'm'
  634. # 确定单位倍数
  635. match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
  636. if match:
  637. if unit == 'm':
  638. if re.search('[kK千公]', match.group()):
  639. multiple_cnt *= 1000
  640. elif re.search('[里]', match.group()):
  641. multiple_cnt *= Decimal(str(500))
  642. else:
  643. if '亩' in match.group():
  644. multiple_cnt *= Decimal(str(666.67))
  645. elif '顷' in match.group():
  646. multiple_cnt *= 10000
  647. elif re.search('千米|公里|k[mM㎡]', match.group()):
  648. multiple_cnt *= 1000000
  649. # print('multiple_cnt1', multiple_cnt)
  650. # 拼接
  651. digit = str(digit * multiple_cnt) + unit
  652. return _proportion, digit
  653. def extract_has_elevator(content):
  654. reg = '电梯'
  655. match = re.search(reg, content)
  656. has_flag = 0
  657. if match:
  658. start_index, end_index = match.span()
  659. has_flag = 1
  660. if judge_yeji(match.span()[0], content):
  661. has_flag = 0
  662. elif re.search('公司', content[end_index:end_index+8]):
  663. has_flag = 0
  664. return has_flag
  665. def extract_project_property(content, property_pattern, property_priority_dict):
  666. property_list = []
  667. for m in re.finditer(property_pattern, content):
  668. for k,v in m.groupdict().items():
  669. if v is not None:
  670. property_list.append([k, property_priority_dict.get(k)])
  671. _property = '新建'
  672. if len(property_list)>0:
  673. property_list.sort(key=lambda x: x[1])
  674. _property = property_list[0][0]
  675. return _property
  676. def extract_several_money(list_sentence, html='', is_obj=True):
  677. money_type_list = ['总投资', '建安费', '工程造价']
  678. money_list = []
  679. all_before_sentence = ''
  680. for i, sentence in enumerate(list_sentence):
  681. last_text = ''
  682. next_text = ''
  683. if is_obj:
  684. text = sentence.sentence_text
  685. all_before_sentence += text
  686. if i > 0:
  687. last_text = list_sentence[i-1].sentence_text[-30:]
  688. if i < len(list_sentence) - 1:
  689. next_text = list_sentence[i+1].sentence_text[:30]
  690. else:
  691. text = sentence
  692. all_before_sentence += text
  693. if i > 0:
  694. last_text = list_sentence[i-1][-30:]
  695. if i < len(list_sentence) - 1:
  696. next_text = list_sentence[i+1][:30]
  697. if judge_yeji(len(all_before_sentence), all_before_sentence, 300+len(text)):
  698. # print('sentence yeji before ' + text)
  699. continue
  700. # if '项目概算总投资为' in text:
  701. _list, _ = get_several_money(text, 0, False, html=html)
  702. # logging.info('get_several_money _list ' + str(_list))
  703. temp_list = []
  704. for l in _list:
  705. if l[-1] == '总投资':
  706. if re.search('业绩', last_text+text+next_text):
  707. continue
  708. temp_list.append(l)
  709. _list = temp_list
  710. money_list += _list
  711. # if money_list:
  712. # break
  713. money_type_dict = {}
  714. for money, _, _, _, money_type in money_list:
  715. for _type in money_type_list:
  716. if _type != money_type:
  717. continue
  718. # 科学计数法
  719. try:
  720. if 'E+' in money:
  721. times = int(money.split('E+')[-1])
  722. _money = float(float(money.split('E+')[0]) * (10 ** times))
  723. else:
  724. _money = float(money)
  725. except:
  726. continue
  727. if _type in money_type_dict.keys():
  728. money_type_dict[_type] += [_money]
  729. else:
  730. money_type_dict[_type] = [_money]
  731. # logging.info('money_type_dict ' + str(money_type_dict))
  732. result_list = []
  733. for _type in money_type_list:
  734. if money_type_dict.get(_type):
  735. if _type == '建安费':
  736. temp_list = money_type_dict.get(_type)
  737. temp_list = list(set(temp_list))
  738. money = 0
  739. for m in temp_list:
  740. money += m
  741. result_list.append(money)
  742. else:
  743. result_list.append(money_type_dict.get(_type)[0])
  744. else:
  745. result_list.append(None)
  746. for i in range(len(result_list)):
  747. if result_list[i] is None:
  748. result_list[i] = 0
  749. result_list[i] = float(result_list[i])
  750. return result_list
def extract_max_floor(content, html=None):
    """Return the largest floor count mentioned in ``content``.

    Several regex families are run over the text (generic "floor count",
    "partial", "up to N floors", above/below ground, "floor counts are
    respectively ...", "N floors in total", above-ground ranges) and every
    number they yield is collected.

    :param content: plain text to scan.
    :param html: optional HTML; when given, its table/div/p elements are
        passed to ``judge_yeji`` together with each match.
    :return: the maximum collected value when it is positive, else None.
    """
    def match_floor(_reg, _content, _reg2=None, _tables_and_divs=None):
        # Collect one integer per match of _reg in _content.  When _reg2 is
        # given, the 35 characters after the match are searched with _reg2
        # and the first value found there is added to the match's value.
        _match = re.finditer(_reg, _content)
        _floor_list = []
        for m in _match:
            # Text of the match (widened by one character for the named
            # groups) handed to judge_yeji as the entity.
            # NOTE(review): these slices index the outer `content`, not
            # `_content`; in the recursive call the spans refer to a
            # substring, so the slice may not be the matched text — confirm.
            if 'reg6' in _reg:
                _floor1 = content[max(0, m.span('reg6')[0]-1):m.span('reg6')[1]+1]
            elif 'reg4' in _reg:
                _floor1 = content[max(0, m.span('reg4')[0]-1):m.span('reg4')[1]+1]
            else:
                _floor1 = content[m.span()[0]:m.span()[1]]
            if judge_yeji(m.span()[0], _content, 300, _tables_and_divs, _floor1):
                continue
            if 'reg6' in _reg:
                # Range such as "3-5": keep the larger bound.
                _floor = m.group('reg6')
                _floor = re.split('[-~~]', _floor)
                _floor = max(int(_floor[0]), int(_floor[1]))
            elif 'reg4' in _reg:
                # Several counts listed: keep the largest Arabic or Chinese numeral.
                _floors = re.findall('\d+', m.group())
                _floors = [int(x) for x in _floors]
                _floors1 = re.findall('[一二两三四五六七八九十]+', m.group())
                _floors1 = [chinese_to_arabic(x) for x in _floors1]
                _floor = max(_floors + _floors1)
            elif '-' in m.group():
                _floor = ''.join(re.findall('\d+-\d+', m.group()))
                if len(_floor) < 1:
                    continue
                _floor = _floor.split('-')
                _floor = max(int(_floor[0]), int(_floor[1]))
            elif '/' in m.group():
                _floor = m.group()
                _floor = re.sub('层', '', _floor)
                _floor = ''.join(re.findall('\d+/\d+', _floor))
                if len(_floor) < 1:
                    continue
                _floor = _floor.split('/')
                _floor = max(int(_floor[0]), int(_floor[1]))
            else:
                # Plain number; fall back to Chinese numerals when no digit is present.
                _floor = ''.join(re.findall('\d+', m.group()))
                if len(_floor) < 1:
                    _floor = ''.join(re.findall('[一二两三四五六七八九十]+', m.group()))
                    if len(_floor) < 1:
                        continue
                    _floor = chinese_to_arabic(_floor)
                _floor = int(_floor)
            if _reg2:
                _floor_list2 = match_floor(_reg2, _content[m.span()[1]:m.span()[1]+35])
                if _floor_list2:
                    _floor2 = int(_floor_list2[0])
                    _floor = _floor + _floor2
            _floor_list.append(_floor)
        return _floor_list
    reg = '(建筑|)(物|)(层数最大|最大层数|最高层|总层数|最大层|层数)[共为::]?(\d{1,3}|\d{1,3}层?/\d{1,3}|[一两二三四五六七八九十]{1,3})[层Ff]'
    reg0 = '局部(建筑|)(层数|)[共为]?(\d{1,3}|[一二两三四五六七八九十]{1,3})层'
    reg1 = '地上(建筑|)(层数|)[共为]?(\d{1,3}层?/\d{1,3}|\d{1,3}-\d{1,3}|\d{1,3}|[一二两三四五六七八九十]{1,3})层'
    reg2 = '地下(建筑|)(层数|)[共为]?(\d{1,3}层?/\d{1,3}|\d{1,3}-\d{1,3}|\d{1,3}|[一二两三四五六七八九十]{1,3})层'
    reg3 = '[到至]\d{1,3}层'
    reg4 = '层数分别.{1,20}(?P<reg4>(\d{1,3}|[一二两三四五六七八九十]{1,3}))层'
    reg5 = '共(\d{1,3}|[一二两三四五六七八九十]{1,3})层'
    reg6 = '地上.{1,10}(?P<reg6>\d{1,3}[-~~]\d{1,3})层'
    if html:
        soup = BeautifulSoup(html, 'lxml')
        tables_and_divs = soup.find_all(['table', 'div', 'p'])
    else:
        tables_and_divs = []
    floor_list = []
    # Generic floor-count wording
    floor_list += match_floor(reg, content, _tables_and_divs=tables_and_divs)
    # "Partial" floors
    floor_list += match_floor(reg0, content, _tables_and_divs=tables_and_divs)
    # "... to floor N"
    floor_list += match_floor(reg3, content, _tables_and_divs=tables_and_divs)
    # Above ground / below ground, each combined with the other when it follows
    floor_list += match_floor(reg1, content, _reg2=reg2, _tables_and_divs=tables_and_divs)
    floor_list += match_floor(reg2, content, _reg2=reg1, _tables_and_divs=tables_and_divs)
    # "Floor counts are respectively ... N floors"
    floor_list += match_floor(reg4, content, _tables_and_divs=tables_and_divs)
    # "N floors in total"
    floor_list += match_floor(reg5, content, _tables_and_divs=tables_and_divs)
    # "Above ground ... N~M floors"
    floor_list += match_floor(reg6, content, _reg2=reg2, _tables_and_divs=tables_and_divs)
    if floor_list:
        floor_list.sort(key=lambda x: x)
        floor = floor_list[-1]
        if floor <= 0:
            return None
        else:
            return floor_list[-1]
    else:
        return None
  843. def extract_structure(content, html=None, structure_keyword_list=None):
  844. # reg = '框架结构|钢框架结构|混凝土框架结构|剪力墙结构|框架-剪力墙结构|框架+剪力墙结构|框架和剪力墙结构|框架及剪力墙结构|混凝土剪力墙结构|筒体结构|桅式结构|墙板结构|膜结构|悬索结构|板柱结构|充气结构|网架结构|壳体结构|拱形结构|穹顶结构|混凝土结构|钢筋混凝土框架结构|钢筋混凝土筒仓结构|钢结构|砌体结构|木结构|砖混结构|排架结构|束筒结构|薄壳结构|钢混结构|砖木结构|砌体结构|钢砼结构|框剪结构|钢筋混凝土框架结构|筒中筒结构|框筒结构|桁架结构|拱券结构|钢筋混凝土结构|框架核心筒结构|门式钢架结构|门钢结构|轻钢结构|钢-混凝土框架结构|木框架结构|空间网格结构|框架筒体结构|砖拱结构|钢筋砼结构|核心筒结构|框架-核心筒结构'
  845. reg1 = '(结构(楼层|)(形式|类型|类别|体系|结构)[为是::])([^结]{2,8}结构)'
  846. reg = '|'.join([x[:-1] for x in structure_keyword_list])
  847. reg = reg_word_sort(reg)
  848. # logging.info(reg)
  849. if html:
  850. soup = BeautifulSoup(html, 'lxml')
  851. tables_and_divs = soup.find_all(['table', 'div', 'p'])
  852. else:
  853. tables_and_divs = []
  854. # match = re.finditer(reg1, content)
  855. # structure_list = []
  856. # for m in match:
  857. # structure = m.group(4)
  858. # structure1 = content[max(0, m.span(4)[0]-1):m.span(4)[1]+1]
  859. # if judge_yeji(m.span()[0], content, 300, tables_and_divs, structure1):
  860. # continue
  861. # structure_list.append(structure)
  862. # if structure_list:
  863. # structure_list = list(set(structure_list))
  864. # structure_list.sort(key=lambda x: x)
  865. # return ','.join(structure_list)
  866. # else:
  867. structure_list = []
  868. match = re.finditer(reg, content)
  869. for m in match:
  870. structure = m.group()
  871. structure1 = content[max(0, m.span()[0]-1):m.span()[1]+1]
  872. if judge_yeji(m.span()[0], content, 300, tables_and_divs, structure1):
  873. continue
  874. if structure in ['钢结构']:
  875. if re.search('公司', content[m.span()[1]:m.span()[1]+8]):
  876. continue
  877. structure_list.append(structure)
  878. if structure_list:
  879. structure_list = list(set(structure_list))
  880. structure_list.sort(key=lambda x: x)
  881. return ','.join(structure_list)
  882. else:
  883. return None
  884. def extract_has_steel_structure(content):
  885. """
  886. :param content: 传入已提取完的structure
  887. :return:
  888. """
  889. if not content:
  890. return 0
  891. reg = '钢结构|门式钢架结构|钢框架结构|钢桁架结构|钢网架结构|钢框结构钢架结构|钢骨架结构|钢骨架式结构|钢管桁架结构|轻钢骨架结构|钢桁架式结构|轻钢门架结构|轻钢门式结构|门式钢屋架结构|钢桁架框架结构|钢框架—支撑结构|钢桁梁结构|网架结构|网壳结构|索膜结构|塔桅结构'
  892. match = re.search(reg, content)
  893. has_flag = 0
  894. if match:
  895. has_flag = 1
  896. return has_flag
  897. def extract_wall_type(doctitle, content):
  898. reg1 = '(幕墙|外墙)'
  899. wall_class_dict = {
  900. '玻璃幕墙': '玻璃|玻璃砖',
  901. '金属幕墙': '铝合金|铝单板|仿木纹铝单板|夹芯保温铝板|铝复合板|蜂窝铝板|仿石材铝板|铝板|彩钢板|不锈钢板|彩涂钢板|珐琅钢板|钛合金板|铝合金装饰网格|镀锌钢网|铜合金|镀层钢板|锌板|钛板',
  902. '陶土板幕墙': '陶土板',
  903. '涂料': '涂料|乳胶漆|弹性涂料|质感涂料|真石漆|水包水|水包砂|岩片漆|金属漆|氟碳漆|仿石涂料|封闭底漆|腻子|罩光清漆|柔性耐水腻子|罩面漆|仿瓷涂料|有机硅丙烯酸涂料|氟碳树脂涂料|硅溶胶外墙涂料|无机纤维喷涂',
  904. '石材': '石材|花岗岩|大理石|砂岩|板岩|岩棉板|岩板',
  905. '瓷砖': '陶瓷砖|劈开砖|釉面砖|通体砖|抛光砖|马赛克砖|陶砖|玻璃砖',
  906. '混凝土': '钢筋混凝土|现浇混凝土|预制混凝土|混凝土砌块|蒸压多孔混凝土|纤维混凝土|UHPC板|UHPC|UHPC装饰板|超高性能混凝土|清水混凝土|混凝土|加气混凝土砌块',
  907. '复合墙板': '复合外墙板|复合墙板|GRC|金属面岩棉板|复合保温幕墙板|金属面复合保温幕墙|金属面岩棉夹心外墙板|夹心保温外墙|金属面岩棉装饰节能一体板|PU板幕墙|保温装饰一体板|保温一体板|岩棉保温装饰一体板|EPS保温板|岩棉保温板|STP保温板|酚醛保温板|聚氨酯保温板|钛锌塑铝复合板|自保温板外墙板|岩棉夹心板外墙板|玻璃棉|EPS板|XPS板|聚氨酯外墙保温板|外墙保温板|保温岩棉复合板|水泥发泡保温板|发泡陶瓷保温板|石墨改性水泥基保温板|挤塑聚苯板|挤塑板|聚苯乙烯板|玻化微珠岩棉板|聚苯板|聚氨酯板|热固复合聚苯乙烯泡沫保温板',
  908. '水泥': '白水泥|白色硅酸盐水泥',
  909. '文化砖': '文化砖',
  910. '木材': 'LVL|单板层积材|高压热固化木纤维板',
  911. '陶棍幕墙': '陶棍幕墙',
  912. '风动幕墙': '风铃幕墙|风动幕墙',
  913. '光电幕墙': '光伏幕墙|光电幕墙',
  914. '织物幕墙': '织物幕墙',
  915. '生态幕墙': '生态幕墙|气循环幕墙|呼吸幕墙|绿色幕墙|健康幕墙',
  916. }
  917. for key in wall_class_dict.keys():
  918. reg = wall_class_dict[key]
  919. reg = reg_word_sort(reg)
  920. wall_class_dict[key] = '(' + reg + ')'
  921. wall_list = []
  922. wall_list2 = []
  923. match = re.search(reg1, doctitle)
  924. if match:
  925. for first_class in wall_class_dict.keys():
  926. reg2 = wall_class_dict.get(first_class)
  927. if first_class in ['陶棍幕墙', '风动幕墙', '光电幕墙', '织物幕墙', '生态幕墙']:
  928. continue
  929. reg2 = reg2 + '(幕墙|外墙|)'
  930. match1 = re.search(reg2, content)
  931. if match1:
  932. wall_list.append(first_class)
  933. wall_list2.append(match1.group())
  934. for first_class in wall_class_dict.keys():
  935. reg2 = wall_class_dict.get(first_class)
  936. # 这5类特殊,只要存在就直接提取
  937. if first_class in ['陶棍幕墙', '风动幕墙', '光电幕墙', '织物幕墙', '生态幕墙']:
  938. match = re.search(reg2, content)
  939. if match:
  940. wall_list.append(first_class)
  941. wall_list2.append(match.group())
  942. # 其他类使用两种规则
  943. else:
  944. reg3 = reg1 + '.{0,10}' + reg2
  945. reg4 = reg2 + reg1
  946. match = re.search(reg3, content)
  947. if match:
  948. wall_list.append(first_class)
  949. match1 = re.search(reg2, match.group())
  950. if match1:
  951. wall_list2.append(match1.group())
  952. match = re.search(reg4, content)
  953. if match:
  954. wall_list.append(first_class)
  955. wall_list2.append(match.group())
  956. if wall_list:
  957. wall_list = list(set(wall_list))
  958. wall_list.sort(key=lambda x: x)
  959. wall_list = ','.join(wall_list)
  960. else:
  961. wall_list = None
  962. if wall_list2:
  963. wall_list2 = list(set(wall_list2))
  964. wall_list2.sort(key=lambda x: x)
  965. wall_list2 = ','.join(wall_list2)
  966. else:
  967. wall_list2 = None
  968. return wall_list, wall_list2
  969. def cut_win_bid_part(_str):
  970. """
  971. 截掉项目概述里的中标相关信息
  972. """
  973. origin_str = _str
  974. reg_list = [
  975. "(评标|中标|中选)(结果|报告)(公示|)[::,,]",
  976. "第[一二三四1-9]名",
  977. "中标候选人(基本情况|附件)",
  978. "(排序|第[一二三1-3]|推荐)中标候选人|中标(候选人|结果)(排序|公示|信息)"
  979. ]
  980. for reg in reg_list:
  981. match = re.finditer(reg, _str)
  982. _index = len(origin_str)
  983. for m in match:
  984. _index = m.span()[0]
  985. break
  986. if _index == len(origin_str):
  987. continue
  988. match = re.finditer("[,,。;;]", _str[:_index])
  989. _index = 0
  990. for m in match:
  991. _index = m.span()[0]
  992. _str = _str[:_index]
  993. match = re.finditer("中[标选]|投标(人|企业|单位)|乙方|供应商|成交", _str)
  994. for m in match:
  995. index_start = m.span()[0]
  996. cut_str = re.split("[,,。;;]", _str[index_start:])[0]
  997. if len(cut_str) < 25:
  998. cut_str = _str[index_start:index_start+25]
  999. # cut_str = _str[index_start:index_start+15]
  1000. # print("cut_str", cut_str)
  1001. match2 = re.search("(¥|)[\d,.]+[万元亿]|公司|业绩|最终得分", cut_str)
  1002. # print("match2", match2)
  1003. if match2:
  1004. # 排除混淆
  1005. match3 = re.search("中标法|获取招标文件|交接|支付|账户|结算|限价|承担|服务费|\*|×", cut_str)
  1006. # print("match3", match3)
  1007. if not match3:
  1008. match4 = re.finditer("[,,。;;]", _str[:index_start])
  1009. index_stop = 0
  1010. for m4 in match4:
  1011. index_stop = m4.span()[0]
  1012. _str = _str[:index_stop]
  1013. break
  1014. # print(_str)
  1015. return _str
def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=None):
    """Decide whether a position in the text lies in a "past performance" context.

    Two checks are made.  First, when ``tables_and_divs`` is given, the
    entity text is looked up in those HTML elements (see ``is_yeji_table``).
    Second, the ``judge_len`` characters of ``content`` that end at
    ``end_index`` are searched for performance-related keywords.

    :param end_index: position in ``content`` up to which the text is examined.
    :param content: plain text.
    :param judge_len: size of the look-back window in characters.
    :param tables_and_divs: optional list of parsed HTML elements
        (presumably BeautifulSoup tags — callers pass ``soup.find_all`` results).
    :param entity: text of the entity, used only for the HTML check.
    :return: 1 when a performance context is detected, otherwise 0.
    """
    def is_yeji_table(_tables_and_divs, _entity_text):
        # Return 1 when _entity_text is found in an element whose row, an
        # earlier narrow row, or the two preceding elements mention a
        # performance keyword; otherwise 0.
        if not _tables_and_divs:
            return 0
        is_yeji = 0
        reg_yeji = '业绩|选取原因|奖项|获奖|供应商信息|近年完成|中标单位信息|评标情况|类似项目|资质|候选人情况'
        for index3, obj in enumerate(_tables_and_divs):
            # NOTE(review): elements that contain a <tr> are skipped here,
            # yet the code below iterates the element's <tr> rows; an earlier
            # commented-out version skipped elements WITHOUT '<tr'.  The
            # condition looks inverted — confirm.
            if obj.find('tr'):
                continue
            _table = obj
            _table_text = str(_table.get_text())
            _m = re.search(re.escape(_entity_text), _table_text)
            # Retry with the entity text trimmed at the front, then at the back.
            # NOTE(review): the trimmed value is stored AND sliced again in
            # the search, so two characters are dropped per retry, and the
            # shortened text is kept for later elements — confirm intent.
            if not _m:
                _entity_text = _entity_text[1:]
                _m = re.search(re.escape(_entity_text[1:]), _table_text)
            if not _m:
                _entity_text = _entity_text[:-1]
                _m = re.search(re.escape(_entity_text[:-1]), _table_text)
            if _m:
                rows = _table.find_all('tr')
                # Largest number of <td> cells in any row.
                max_col_span = 0
                for row in rows:
                    if len(row.find_all('td')) > max_col_span:
                        max_col_span = len(row.find_all('td'))
                for index, row in enumerate(rows):
                    if re.search(re.escape(_entity_text), str(row.get_text())):
                        cells = row.find_all('td')
                        if len(cells) == 0:
                            continue
                        # Does the first cell of the row mention performance?
                        if re.search(reg_yeji, str(cells[0].get_text())):
                            is_yeji = 1
                        # Otherwise look upwards for a row with at most half
                        # the maximum cell count that mentions performance.
                        else:
                            if index > 0:
                                for row2 in rows[:index][::-1]:
                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2.get_text())):
                                        is_yeji = 1
                                        break
                # Nothing found so far: check the two elements just before this one.
                div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3-2):index3]]
                if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
                    is_yeji = 1
                # Only the first element containing the entity is examined.
                break
        return is_yeji
    # HTML-based check first.
    if tables_and_divs:
        yeji_table_flag = is_yeji_table(tables_and_divs, entity)
        if yeji_table_flag:
            return yeji_table_flag
    if len(content) == 0:
        return 0
    if end_index == 0:
        return 0
    # Plain-text check: keywords within the last judge_len characters before end_index.
    reg_yeji = '业绩|选取原因|奖项|获奖|供应商信息|近年完成'
    if re.search(reg_yeji, content[:end_index][-judge_len:]):
        return 1
    else:
        return 0
def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
    """Find monetary amounts in one sentence and normalise them.

    Candidate amounts are located with a combined regex (Chinese capital
    numerals, keyword + number, unit-before-number, number + unit).  Each
    candidate is filtered (performance context, phone numbers, rates,
    quantity units, missing unit), converted to a plain decimal string and
    tagged with a note ('总投资', '建安费', '工程造价' or '').

    :param sentence_text: sentence to scan.
    :param found_yeji: running count of performance markers seen so far;
        once it reaches 2 no amounts are extracted.
    :param in_attachment: not used inside this function.
    :param html: optional HTML; its table/div elements are passed to
        ``judge_yeji``.
    :return: ``(money_list, found_yeji)`` where ``money_list`` holds tuples
        ``(amount_text, start_index, end_index, unit, notes)`` for amounts
        whose integer part is at least 1000, and ``found_yeji`` is the
        updated counter.
    """
    def getDigitsDic(_unit):
        '''
        @summary: return the digit for a Chinese numeral character (None if unknown)
        '''
        DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
                     "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
        return DigitsDic.get(_unit)
    def getMultipleFactor(_unit):
        '''
        @summary: return the multiplier for a unit character (None if unknown)
        '''
        MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
        return MultipleFactor.get(_unit)
    def getUnifyMoney(money):
        '''
        @summary: convert an amount written with digits and/or Chinese numerals to a number
        @param:
            money: amount string
        @return: Decimal amount; Decimal(0) when parsing fails or the value is too large
        '''
        MAX_MONEY = 1000000000000
        MAX_NUM = 12
        # Strip commas, then every character that cannot be part of an amount.
        money = re.sub("[,,]","",money)
        money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
        result = Decimal(0)
        chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
        # Units are tried in this order; the first one present splits the string.
        chnFactorUnits = ["圆", "元","兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
        LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
        BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
        try:
            if re.search(LowMoneypattern,money) is not None:
                # Plain digits.
                return Decimal(money)
            elif re.search(BigMoneypattern,money) is not None:
                # A single capital numeral.
                return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
            for factorUnit in chnFactorUnits:
                if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
                    # Split at the LAST occurrence of the unit: [before, after].
                    subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
                    if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None:
                        if MAX_MONEY/getMultipleFactor(factorUnit)<Decimal(subMoneys[0]):
                            return Decimal(0)
                        result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
                    elif len(subMoneys[0])==1:
                        if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
                            result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
                    # Nothing before the unit.
                    elif subMoneys[0]=="":
                        result += 0
                    # No unit left in the first part: it cannot be split further.
                    elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
                        result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
                    else:
                        result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
                    if len(subMoneys)>1:
                        if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
                            result += Decimal(subMoneys[1])
                        elif len(subMoneys[1])==1:
                            if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
                                result += Decimal(getDigitsDic(subMoneys[1]))
                        else:
                            result += Decimal(getUnifyMoney(subMoneys[1]))
                    break
        except Exception as e:
            # Any parsing problem yields 0.
            return Decimal(0)
        return result
    # Parse the HTML so table context can be used for the performance check.
    if html:
        soup = BeautifulSoup(html, 'lxml')
        tables = soup.find_all('table')
        tables_and_divs = soup.find_all(['table', 'div'])
    else:
        tables_and_divs = []
    money_list = []
    # Regexes used to recognise amounts.
    entity_type = "money"
    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_key_word>(E-?\d+))?[百千]{,1})(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
                          "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_front_m>(E-?\d+))?(?:,?)[百千]*)())",
                          "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
    # 2021/7/19: amount/unit regexes adjusted so amounts are no longer dropped
    # when their unit could not be extracted.
    pattern_money = re.compile("%s|%s|%s|%s" % (
        list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
        list_money_pattern["front_m"]))
    # Repair "元千万元"-style text by dropping the leading 元.
    match = re.search('元[千百十]万元', sentence_text)
    if match:
        sentence_text = re.sub(re.escape(match.group()), match.group()[1:], sentence_text)
    # Repair "千多万元"-style text by dropping 多.
    match = re.search('[千百十]多万元', sentence_text)
    if match:
        sentence_text = re.sub(re.escape(match.group()), match.group()[0] + match.group()[2:], sentence_text)
    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
        found_yeji += 1
    if found_yeji >= 2:  # performance marker seen twice: extract nothing
        all_match = []
    else:
        # Blank out fee-standard / formula text so its numbers are not read as amounts.
        ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text)
        if ser:
            all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0))))
        else:
            all_match = re.finditer(pattern_money, sentence_text)
    for _match in all_match:
        if len(_match.group()) > 0:
            notes = ''  # tag for the amount: '总投资', '建安费', '工程造价' or ''
            unit = ""
            entity_text = ""
            start_index = ""
            end_index = ""
            text_beforeMoney = ""
            filter = ""
            filter_unit = False
            notSure = False
            science = ""
            # 2021/7/21: stop at the first amount preceded by a performance marker.
            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):
                found_yeji += 1
                break
            yeji_table_flag = 0
            # Named groups are prefixed money_/unit_/text_/filter_/science_;
            # dispatch on the prefix.
            for k, v in _match.groupdict().items():
                if v != "" and v is not None:
                    if k == 'text_key_word':
                        notSure = True
                    if k.split("_")[0] == "money":
                        entity_text = v
                        if judge_yeji(len(sentence_text), sentence_text, 300, tables_and_divs, entity_text):
                            yeji_table_flag = 1
                            break
                        # An amount cannot end in ",00"; presumably a misread
                        # decimal point, so the tail is dropped.
                        if entity_text.endswith(',00'):
                            entity_text = entity_text[:-3]
                    if k.split("_")[0] == "unit":
                        # Handles mismatched units such as "预算金额(元):160万元":
                        # '万元' always wins, otherwise the first unit seen is kept.
                        if v == '万元' or unit == "":
                            unit = v
                    if k.split("_")[0] == "text":
                        text_beforeMoney = v
                    if k.split("_")[0] == "filter":
                        filter = v
                        if re.search("filter_unit", k) is not None:
                            filter_unit = True
                    if k.split("_")[0] == 'science':
                        science = v
            if yeji_table_flag:
                continue
            if filter != "":
                continue
            start_index, end_index = _match.span()
            start_index += len(text_beforeMoney)
            # Skip phone numbers, codes, dates and similar numbers.
            '''过滤掉手机号码作为金额'''
            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
                continue
            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search(':\w{1,3}$', text_beforeMoney):  # e.g. '金额(万元):季勇13863441880'
                continue
            if unit == "":  # 2021/7/21: infer a unit when the text clearly is an amount
                if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
                    if entity_text.endswith('万元'):
                        unit = '万元'
                        entity_text = entity_text[:-2]
                    else:
                        unit = '元'
                elif re.search('USD[::]?$', text_beforeMoney):
                    unit = '美元'
                elif re.search('EUR[::]?$', text_beforeMoney):
                    unit = '欧元'
                elif re.search('JPY[::]?$', text_beforeMoney):
                    unit = '日元'
                elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
                    # Two amounts joined by a dash: use the unit of the second one.
                    unit = '万元'
                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
                    if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                        # Small number and the sentence mentions 万元.
                        unit = '万元'
                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
                        unit = '万元'
                    else:
                        # Number directly after an amount keyword.
                        unit = '元'
                elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)', entity_text):
                    unit = '元'
                else:
                    # No unit and no amount-like features: drop it.
                    continue
            elif unit == '万元':
                if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
                    unit = '元'
                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text):  # implausibly large for 万元: treat as 元
                    unit = '元'
            if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19: 万 already in the amount text, do not apply it twice
                unit = "元"
            if re.search('.*万元万元', entity_text):  # 2021/7/19: collapse a doubled 万元
                entity_text = entity_text.replace('万元万元', '万元')
            else:
                # The unit was followed by a quantity word (台/个/吨/...).
                if filter_unit:
                    continue
            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
            # Tag the amount from keywords within/just before the match.
            if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额|投资估算|投资概算',
                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5
                notes = '总投资'
            elif re.search('建安费|建安工程费|建筑安装工程费|建安工程造价',
                           sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/11/18
                notes = '建安费'
            elif re.search('总工程造价|总造价',
                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20
                notes = '工程造价'
            # Plain-text performance check on the text before the amount.
            if judge_yeji(max(0, _match.span()[0] - 10), sentence_text):
                continue
            if len(unit) > 0:
                if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8:  # 2021/7/19: 万元 amount too large, undo the x10000
                    entity_text = str(
                        getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
                    unit = '元'  # unit reset after the correction
                else:
                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
            else:
                if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
                        entity_text.split('.')[0]) >= 8:
                    entity_text = str(getUnifyMoney(entity_text) / 10000)
                else:
                    entity_text = str(getUnifyMoney(entity_text))
            if science and re.search('^E-?\d+$', science):  # scientific notation
                # Applied only when the result lies strictly between 100 and 10000000000.
                entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
                    entity_text + science) < 10000000000 else entity_text
            if float(entity_text) > 100000000000:  # 2022/3/4: the lower bound was removed, only the upper bound remains
                continue
            if notSure and unit == "" and float(entity_text) > 100 * 10000:
                continue
            if re.search('[%%‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000:  # probably a rate, not an amount
                continue
            money_list.append((entity_text, start_index, end_index, unit, notes))
    # Drop amounts whose integer part is below 1000.
    temp_list = []
    for money in money_list:
        if int(float(money[0])) < 1000:
            continue
        temp_list.append(money)
    money_list = temp_list
    return money_list, found_yeji
  1346. def chinese_to_arabic(ch_str):
  1347. chinese_number_dict = {
  1348. '一': 1,
  1349. '二': 2,
  1350. '两': 2,
  1351. '三': 3,
  1352. '四': 4,
  1353. '五': 5,
  1354. '六': 6,
  1355. '七': 7,
  1356. '八': 8,
  1357. '九': 9,
  1358. '十': 10,
  1359. '拾': 10,
  1360. '百': 100,
  1361. '千': 1000,
  1362. }
  1363. no_list = []
  1364. for c in ch_str:
  1365. if c not in chinese_number_dict.keys():
  1366. return None
  1367. no_list.append(chinese_number_dict.get(c))
  1368. arabic_num = 0
  1369. mul_no = None
  1370. for i, no in enumerate(no_list):
  1371. if no in [10, 100, 1000]:
  1372. if mul_no is None:
  1373. arabic_num += no
  1374. else:
  1375. arabic_num += no * mul_no
  1376. mul_no = None
  1377. else:
  1378. mul_no = no
  1379. if mul_no:
  1380. arabic_num += mul_no
  1381. return arabic_num
  1382. def reg_word_sort(reg):
  1383. ss = reg.split('|')
  1384. ss.sort(key=lambda x: len(x), reverse=True)
  1385. reg = '|'.join(ss)
  1386. return reg
  1387. def get_stage_pattern():
  1388. stage_dict = {
  1389. "立项阶段": "立项|项目投资",
  1390. "可研阶段": "可行性研究|可研",
  1391. "环评阶段": "环境评价|环境影响|环境评测|环评|(水保|环保|环境保护)(编制|验收|监测)",
  1392. "稳评阶段": "稳定风险|社会稳定|风险评估",
  1393. "咨询阶段": "(水影响|能源|交通影响|地质灾害|地址灾害|地震安全性|地震安全性|气象|雷击风险|安全|海洋|森林环境)(评[价估测])|水土保持|(水|交|灾|震|能|气|安|海|林)评",
  1394. "造价阶段": "(决算书|预算|结算|造价|决算)(编制|咨询)",
  1395. "设计阶段": "(施工图(纸|)|初步|项目|工程)(方案|)设计|测绘|规划设计",
  1396. "勘察阶段": "(勘察|勘查)设计|勘察技术|勘查|勘察",
  1397. "施工图审": "(施工图(纸|)|防雷|消防|人防)审查|施工图审",
  1398. "施工许可": "施工许可证",
  1399. "施工准备": "施工准备|监理|资格预审|资审",
  1400. "施工在建": "施工",
  1401. "竣工阶段": "竣工|验收",
  1402. "EPC总承包": "总承包|EPC"
  1403. }
  1404. stage_priority_dict = {
  1405. "立项阶段": 1,
  1406. "可研阶段": 3,
  1407. "环评阶段": 2,
  1408. "稳评阶段": 3,
  1409. "咨询阶段": 2,
  1410. "造价阶段": 2,
  1411. "设计阶段": 4,
  1412. "勘察阶段": 4,
  1413. "施工图审": 2,
  1414. "施工许可": 2,
  1415. "施工准备": 3,
  1416. "施工在建": 5,
  1417. "竣工阶段": 3,
  1418. "EPC总承包": 4
  1419. }
  1420. list_stage_v = []
  1421. for k,v in stage_dict.items():
  1422. list_stage_v.append("(?P<%s>%s)"%(k,v))
  1423. stage_pattern = "|".join(list_stage_v)
  1424. return stage_pattern, stage_priority_dict
  1425. def get_industry_pattern():
  1426. filename = os.path.abspath(os.path.dirname(__file__)) + "/proposedBuildingKeyword.xlsx"
  1427. df = pd.read_excel(filename)
  1428. dict_industry_keywords = {}
  1429. for _industry, _keyword in zip(df["类别"], df["关键词"]):
  1430. if _industry not in dict_industry_keywords:
  1431. dict_industry_keywords[_industry] = set()
  1432. dict_industry_keywords[_industry].add(_keyword)
  1433. list_industry_p = []
  1434. for k, v in dict_industry_keywords.items():
  1435. if len(v) > 0:
  1436. list_industry_p.append("(?P<%s>%s)" % (k, "|".join(list(v))))
  1437. # logging.info('get_industry_pattern ' + str(list_industry_p))
  1438. _pattern = re.compile("|".join(list_industry_p))
  1439. return _pattern
  1440. def get_property_pattern():
  1441. property_dict = {
  1442. "复合性质": "扩迁建|扩改建|扩翻建|改扩建|改翻建|迁扩建|迁改建|迁扩建|翻改建|翻扩建",
  1443. "迁建": "迁建|搬迁重建",
  1444. "扩建": "扩建|增建|加建|扩大",
  1445. "翻建": "翻建|拆除重建",
  1446. "改建": "改造|改建|技改|提升|改进|整改",
  1447. "装饰装修": "装修|室内装饰|室外装饰|装饰工程|装饰改造工程|装饰装修|装修装饰|幕墙工程|修缮|外墙翻新|翻新工程|整修|修补|装饰|维修",
  1448. "拆除": "拆除",
  1449. "恢复重建": "恢复重建|灾后重建",
  1450. '其他': '整治|修复|环境治理|更换'
  1451. }
  1452. property_priority_dict = {
  1453. "复合性质": 1,
  1454. "迁建": 2,
  1455. "扩建": 3,
  1456. "翻建": 4,
  1457. "改建": 5,
  1458. "装饰装修": 6,
  1459. "拆除": 7,
  1460. "恢复重建": 8,
  1461. '其他': 9,
  1462. "新建": 10
  1463. }
  1464. list_property_v = []
  1465. for k,v in property_dict.items():
  1466. list_property_v.append("(?P<%s>%s)"%(k,v))
  1467. property_pattern = "|".join(list_property_v)
  1468. return property_pattern, property_priority_dict