Quellcode durchsuchen

优化产品配置提取,预估召回率和准确率能达到85%以上,有值率25%左右

luojiehua vor 1 Jahr
Ursprung
Commit
f7ab637cbe

+ 6 - 4
BaseDataMaintenance/maintenance/dataflow.py

@@ -499,7 +499,7 @@ class Dataflow():
             return _split
         return []
 
-    def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
+    def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count,document_tmp_doctitle]):
 
         list_data = []
         if isinstance(_query,list):
@@ -2205,6 +2205,8 @@ class Dataflow_dumplicate(Dataflow):
         else:
             _dict["project_code"] = ""
         _dict["doctitle_refine"] = _extract.get("doctitle_refine","")
+        if _dict["doctitle_refine"]=="":
+            _dict["doctitle_refine"] = _dict.get("doctitle")
         _dict["nlp_enterprise"] = str({"indoctextcon":_extract.get("nlp_enterprise",[]),
                                        "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])})
         _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
@@ -2256,7 +2258,7 @@ class Dataflow_dumplicate(Dataflow):
             return the_group[:_index+1]
         return []
 
-    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
+    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=True):
         document_less = _dict1
         docid_less = _dict1["docid"]
         docchannel_less = document_less["docchannel"]
@@ -3894,7 +3896,7 @@ class Dataflow_dumplicate(Dataflow):
                 singleNum_keys = _rule["singleNum_keys"]
                 contain_keys = _rule["contain_keys"]
                 multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district])
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle])
                 _i += step
 
 
@@ -4173,7 +4175,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(349638765)
+    df_dump.test_dumplicate(359517787)
     # df_dump.test_merge([292315564],[287890754])
     # df_dump.flow_remove_project_tmp()
     print("takes",time.time()-a)

+ 12 - 4
BaseDataMaintenance/maintenance/product/1.py

@@ -1,8 +1,16 @@
-
+#coding:utf8
 
 import re
-from BaseDataMaintenance.maintenance.product.htmlparser import ParseDocument
+from bs4 import BeautifulSoup
 
-pd = ParseDocument("_html",False)
+p = '''
+包名称:包B:电脑恒温电蜡疗仪,全自动红外母乳分析仪,生物反馈治疗仪,磁刺激仪,多参数生物反馈仪、婴幼儿养育照护指导中心综合管理平台、多功能婴儿培养箱供应商名称:济南旭博医疗设备有限公司
+<table border="1"><tbody><tr><td colspan="1">货物名称</td><td colspan="1">品牌</td><td colspan="1">产地</td><td colspan="1">规格要求</td><td colspan="1">单价(元)/优惠率</td><td colspan="1">数量/单位</td></tr><tr><td colspan="1">婴幼儿养育照护指导中心综合管理平台</td><td colspan="1">北京零六</td><td colspan="1">北京/北京零六爱成长健康科技有限公司</td><td colspan="1">爱成长</td><td colspan="1">220000.000000</td><td colspan="1">1套</td></tr><tr><td colspan="1">电脑恒温电蜡疗仪</td><td colspan="1">苏州好博 </td><td colspan="1">苏州/苏州好博医疗器械股份有限公司</td><td colspan="1">HB-LY3</td><td colspan="1">104000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">全自动红外母乳分析仪</td><td colspan="1">泰安康宇</td><td colspan="1">泰安/泰安市康宇医疗器械有限公司</td><td colspan="1">KY-9002</td><td colspan="1">200000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">多功能婴儿培养箱</td><td colspan="1">宁波戴维</td><td colspan="1">宁波/宁波戴维医疗器械股份有限公司</td><td colspan="1">YP-3000</td><td colspan="1">302000.000000</td><td colspan="1">2台</td></tr><tr><td colspan="1">多参数生物反馈仪</td><td colspan="1">南京伟思</td><td colspan="1">南京/南京伟思医疗科技股份有限公司</td><td colspan="1">Infiniti3000C</td><td colspan="1">220000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">生物反馈治疗仪</td><td colspan="1">南京锐诗得</td><td colspan="1">南京/南京锐诗得医疗科技有限公司</td><td colspan="1">RSD RM4</td><td colspan="1">87000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">磁刺激仪</td><td colspan="1">南京伟思</td><td colspan="1">南京/南京伟思医疗科技股份有限公司</td><td colspan="1">Magneuro100HZ</td><td colspan="1">355000.000000</td><td colspan="1">1台</td></tr></tbody></table>
 
-print("==",pd.find_title_by_pattern("H、全自动糖化血红蛋白"))
+'''
+_text = BeautifulSoup(p,"html5lib").get_text()
+print(_text)
+meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
+not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|总价|专家论证意见|评标方法|代理服务费|售后服务|邮政编码|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方|甲方|合同|报价|采购人|技术支持服务"
+print(list(set(re.findall(meter_pattern,_text))))
+print(list(set(re.findall(not_meter_pattern,_text))))

+ 78 - 28
BaseDataMaintenance/maintenance/product/htmlparser.py

@@ -14,8 +14,9 @@ from bs4 import BeautifulSoup
 import copy
 
 end_pattern = "商务要求|评分标准|商务条件|商务条件"
-_param_pattern = "(产品|技术|清单|配置|参数|具体|明细|项目|招标|货物|服务|规格|工作|具体)[及和与]?(指标|配置|条件|要求|参数|需求|规格|名称及要求)|配置清单|(质量|技术).{,10}要求|验收标准|^(参数|功能)$"
-meter_pattern = "\d+([毫μ千]?[升L]|摄氏度|kg|mm|um|cm|mol|Mpa|Hz|℃|W|min)|角度|容积|色彩|帧率|磁场|强度|允差|噪音|材质|频率|阻抗|浓度|范围|误差|精确|温度|可调|设定值|功能|检测|高度|宽度|模式|尺寸|重量|峰值|容量|寿命|稳定性|高温|电源|电压|功率|压力|压强"
+_param_pattern = "(产品|技术|清单|配置|参数|具体|明细|项目|招标|货物|服务|规格|工作|具体)[及和与]?(指标|配置|条件|要求|参数|需求|规格|条款|名称及要求)|配置清单|(质量|技术).{,10}要求|验收标准|^(参数|功能)$"
+meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
+not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|专家论证意见|评标方法|代理服务费|售后服务|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方"
 
 
 def getTrs(tbody):
@@ -696,7 +697,6 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|设备材
                             if re.search("^\d+$",cell_text) is not None:
                                 has_number = True
 
-
                         if cell_i>=len(line):
                             continue
                         cell = line[cell_i]
@@ -707,7 +707,7 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|设备材
 
                     if len(table_products)>0:
                         logger.debug("table products %s"%(str(table_products)))
-                        if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=20:
+                        if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=30:
                             if re.search("招标人|代理人|预算|数量|交货期|品牌|产地","".join(table_products)) is None:
                                 list_table_products.append(table_products)
     _find = False
@@ -715,12 +715,13 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|设备材
         for _p in table_products:
             if is_similar(_product,_p,90):
                 _find = True
-                list_result = list(set([a for a in table_products if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
+                logger.debug("similar table_products %s"%(str(table_products)))
+                list_result = list(set([a for a in table_products if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费|^其他$",a) is None]))
                 break
     if not _find:
         for table_products in list_table_products:
             list_result.extend(table_products)
-        list_result = list(set([a for a in list_result if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
+        list_result = list(set([a for a in list_result if len(a)>1 and len(a)<30 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
     return list_result
 
 
@@ -767,12 +768,20 @@ def get_correct_product(product,products):
 def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
     _text = ""
 
+    end_next = False
     for _child in childs:
 
         child_text = _child.get("text")
 
+
         if child_text.find(_product)>=0:
-            is_begin = True
+            if not is_begin:
+                is_begin = True
+                if not end_next:
+                    if _child["sentence_title"] is not None and isinstance(_child["title_next"],dict) and _child["title_next"]["sentence_title"] is not None:
+                        end_next = True
+                        end_title = _child["title_next"]
+                        logger.debug("end_title %s "%end_title["text"])
 
         logger.debug("%s-%s-%s"%("get_childs_text",child_text[:10],str(is_begin)))
 
@@ -797,31 +806,43 @@ def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
             _text += _child.get("text")+"\r\n"
         childs2 = _child.get("child_title",[])
 
+
         if len(childs2)>0:
             for _child2 in childs2:
                 child_text,is_begin,is_end = get_childs_text([_child2],_product,products,is_begin)
-                if is_begin and is_end:
-                    break
-                else:
-                    if is_begin:
-                        _text += child_text
+                if is_begin:
+                    _text += child_text
+                    if is_end:
+                        break
+
+        if end_next:
+            is_end = True
+
+    #     logger.debug("%s-%s-%s"%("get_childs_text1",_text,str(is_begin)))
+    # logger.debug("%s-%s-%s"%("get_childs_text2",_text,str(is_begin)))
     return _text,is_begin,is_end
 
 def extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result,):
     _data = list_data[_data_i]
     childs = _data.get("child_title",[])
     if len(childs)>0:
-        child_text,_,_ = get_childs_text([parent_title],_product,products)
-        logger.info("extract_parameters_by_tree child_text:%s"%child_text)
+        child_text,_,_ = get_childs_text([_data],_product,products)
         if len(child_text)>0:
+            logger.info("extract_type by_tree child_text:%s"%child_text)
             list_result.append(child_text)
     if parent_title is not None:
+        child_text,_,_ = get_childs_text([parent_title],_product,products)
+        if len(child_text)>0:
+            logger.info("extract_type by_tree child_text:%s"%child_text)
+            list_result.append(child_text)
+
         childs = parent_title.get("child_title",[])
         if len(childs)>0:
 
             range_data = get_range_data_by_childs(list_data[_data_i:],childs)
             p_text = ""
             _find = False
+            end_id = id(_data["title_next"]) if isinstance(_data["sentence_title"],dict) and _data["title_next"] is not None and _data["title_next"]["sentence_title"] is not None else None
             for pdata in range_data:
                 ptext = pdata["text"]
                 for p in products:
@@ -832,8 +853,11 @@ def extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,
                     _find = True
                 if _find:
                     break
+                if id(pdata)==end_id:
+                    break
                 p_text += ptext+"\r\n"
             if len(p_text)>0:
+                logger.debug("extract_type by parent range_text:%s"%p_text)
                 list_result.append(p_text)
                 return True
     return False
@@ -861,8 +885,8 @@ def get_table_pieces(_text,_product,products,list_result,_find):
             if _find:
                 list_trs.append(tr)
         if len(list_trs)>0:
-            logger.debug("extract_type table slices")
             table_html = "<table>%s</table>"%("\r\n".join([str(a) for a in list_trs]))
+            logger.debug("extract_type table slices %s"%(table_html))
             list_result.append(table_html)
 
 def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result):
@@ -887,8 +911,8 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
         if re.search(_param_pattern,text_line_first) is not None and text_line_first.find(_product)>=0:
             _flag = True
         if _flag:
-            logger.debug("extract_type add all table %s"%_text)
             if len(products)==0:
+                logger.debug("extract_type whole table by param and product %s"%(_text))
                 list_result.append(_text)
             else:
                 for p in products:
@@ -911,15 +935,26 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
             for line in list_table:
                 for cell in line:
                     cell_text = cell[0]
-                    if len(cell_text)>50 and len(re.findall("\d+",cell_text))>10 and cell_text.find(_product)>=0:
-                        list_result.append(cell_text)
+                    if len(cell_text)>50 and len(re.findall(meter_pattern,cell_text))>5 and cell_text.find(_product)>=0:
+                        _f = True
+                        for cell in line:
+                            if not _f:
+                                break
+                            cell_text = cell[0]
+                            for p in products:
+                                if cell_text.find(p)>=0 and p!=_product:
+                                    _f = False
+                                    break
+                        if _f:
+                            logger.debug("extract_type param column %s"%(cell_text))
+                            list_result.append(cell_text)
                     if len(cell_text)<len(_product)*10 and str(cell_text).find(_product)>=0:
                         for _index in list_head_index:
                             if _index>=len(line):
                                 continue
                             _cell = line[_index]
                             if len(cell[0])>0:
-                                logger.info("%s-%s"%("extract_type add on table",_cell[0]))
+                                logger.info("%s-%s"%("extract_type add on table text:",_cell[0]))
                                 list_result.append(_cell[0])
         if not _flag and (re.search(_param_pattern,_text) is not None or (parent_title is not None and re.search(_param_pattern,parent_title["text"]) is not None)) and _text.find(_product)>=0:
             get_table_pieces(_text,_product,products,list_result,False)
@@ -984,14 +1019,14 @@ def extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,lis
     if re.search(_param_pattern,_text) is not None and len(_text)<50:
         childs = _data["child_title"]
         if len(childs)>0:
-            logger.debug("extract_type sentence %s"%("re.search(_param_pattern,_text) is not None and len(_text)<50:"))
             extract_text,_,_ = get_childs_text([_data],_product,products)
             if len(extract_text)>0:
+                logger.debug("extract_type param-product %s"%(extract_text))
                 list_result.append(extract_text)
             elif is_project:
-                logger.debug("extract_type sentence is_project")
                 extract_text,_,_ = get_childs_text([_data],_product,products,is_begin=True)
                 if len(extract_text)>0 and re.search(meter_pattern,extract_text) is not None:
+                    logger.debug("extract_type sentence is_project param-product is product %s"%(extract_text))
                     list_result.append(extract_text)
 
 def getBestProductText(list_result,_product,products):
@@ -1006,7 +1041,7 @@ def getBestProductText(list_result,_product,products):
         _result = list_result[i]
         _check = True
         _result_text = BeautifulSoup(_result,"html5lib").get_text()
-        _search = re.search("项目编号[::]|项目名称[::]|联合体投标",_result)
+        _search = re.search("项目编号[::]|项目名称[::]|联合体投标|开户银行",_result)
         if _search is not None:
             logger.debug("result%d error illegal text %s"%(i,str(_search)))
             _check = False
@@ -1015,7 +1050,7 @@ def getBestProductText(list_result,_product,products):
                 if _result_text.find(p)>0 and not (is_similar(_product,p,80) or p.find(_product)>=0 or _product.find(p)>=0):
                     logger.debug("result%d error product scoss %s"%(i,p))
                     _check = False
-        if len(_result_text)<50:
+        if len(_result_text)<100:
             if re.search(meter_pattern,_result_text) is None:
                 logger.debug("result%d error text min count"%(i))
                 _check = False
@@ -1027,6 +1062,19 @@ def getBestProductText(list_result,_product,products):
                 logger.debug("result%d error text max count less meter"%(i))
                 _check = False
 
+        list_find = list(set(re.findall(meter_pattern,_result_text)))
+
+        not_list_find = list(set(re.findall(not_meter_pattern,_result_text)))
+        _count = len(list_find)-len(not_list_find)
+        has_num = False
+        for _find in list_find:
+            if re.search('[0-9a-zA-Z]',_find) is not None:
+                has_num = True
+                break
+        if not(_count>=2 and has_num or _count>=5):
+            logger.debug("result%d error match not enough"%(i))
+            _check = False
+
         if _check:
             return _result
 
@@ -1052,7 +1100,7 @@ def extract_product_parameters(list_data,_product):
         if _type=="sentence":
             if _text.find(_product)>=0:
                 _find_count += 1
-                if re.search("项目名称|采购项目",_text) is not None:
+                if re.search("项目名称|采购项目",_text) is not None and re.search("等",_text) is not None:
                     is_project = True
             extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project)
 
@@ -1061,13 +1109,14 @@ def extract_product_parameters(list_data,_product):
                 _find_count += 1
             extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result)
 
-    return getBestProductText(list_result,_product,products),_find_count
+    _text = getBestProductText(list_result,_product,products)
+    return _text,_find_count
 
 
 if __name__ == '__main__':
 
-    filepath = "download/fa85b009fad13bd5e48ae1a51d2e3175.html"
-    _product = "自动体外除颤器(AED)等"
+    filepath = "download/4597dcc128bfabc7584d10590ae50656.html"
+    _product = "彩色多普勒超声诊断仪"
 
     _html = open(filepath, "r", encoding="utf8").read()
 
@@ -1079,5 +1128,6 @@ if __name__ == '__main__':
 
     _text,_count = extract_product_parameters(list_data,_product)
     logger.info("find count:%d"%(_count))
-    logger.info("extract_text %s"%_text)
+    logger.info("extract_parameter_text::%s"%(_text))
+
 

+ 113 - 88
BaseDataMaintenance/maintenance/product/product_attachment.py

@@ -11,6 +11,7 @@ from BaseDataMaintenance.model.ots.attachment import *
 from BaseDataMaintenance.common.Utils import *
 from BaseDataMaintenance.common.ossUtils import *
 from BaseDataMaintenance.maintenance.product.htmlparser import *
+from BaseDataMaintenance.maintenance.product.productUtils import pool_product
 import oss2
 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
 
@@ -21,6 +22,8 @@ parameter_status_process_failed = 2
 parameter_status_process_jump = 3
 parameter_status_not_found = 4
 
+import redis
+
 class Product_Attachment_Processor():
 
     def __init__(self,):
@@ -41,6 +44,7 @@ class Product_Attachment_Processor():
         self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
         self.current_path = os.path.dirname(__file__)
         self.download_path = "%s/%s"%(self.current_path,"download")
+        self.test_url="http://192.168.2.102:15011/convert"
 
     def process_parameters_producer(self,):
 
@@ -52,7 +56,7 @@ class Product_Attachment_Processor():
         list_id = []
         rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
                                                                             SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
-                                                                            ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
+                                                                            ColumnsToGet([DOCUMENT_PRODUCT_ATTACHMENTS,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
 
         list_data = getRow_ots(rows)
         for data in list_data:
@@ -66,7 +70,7 @@ class Product_Attachment_Processor():
                 break
             rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
                                                                                 SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                                ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
+                                                                                ColumnsToGet([DOCUMENT_PRODUCT_ATTACHMENTS,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
             list_data = getRow_ots(rows)
             for data in list_data:
                 _id = data.get(DOCUMENT_PRODUCT_ID)
@@ -76,38 +80,35 @@ class Product_Attachment_Processor():
                 list_id.append(_id)
         self.set_product_attachment =  set(list_id)
 
-    def process_parameters_handler(self,item,result_queue):
-        bid_filemd5s = item.get(DOCUMENT_PRODUCT_BID_FILEMD5S)
-        product_name = item.get(DOCUMENT_PRODUCT_NAME)
-        product_original_name = item.get(DOCUMENT_PRODUCT_ORIGINAL_NAME)
-        list_product = []
-        if product_name is not None:
-            list_product.append(product_name)
-        if product_original_name is not None:
-            list_product.extend(product_original_name.split("_"))
-        list_product = list(set(list_product))
-        dp = Document_product(item)
-        if bid_filemd5s is None or bid_filemd5s=="" or len(list_product)==0:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
-            dp.update_row(self.ots_client)
-            return
-        list_filemd5 = bid_filemd5s.split(",")
-        _find = False
-        _success = False
-        for _filemd5 in list_filemd5:
-            if _find:
-                break
-            atta = attachment({attachment_filemd5:_filemd5})
+    def get_whole_html(self,_filemd5):
+        atta = attachment({attachment_filemd5:_filemd5})
+        _html = ""
+
+        db = redis.Redis(connection_pool=pool_product)
+        _key = "filemd5:%s"%(_filemd5)
+
+        _cache_html = None
+        try:
+            _cache_html = db.get(_key)
+        except Exception as e:
+            logger.info("get redis cache html error")
+        
+        if _cache_html is not None:
+            _html = _cache_html
+        else:
             if atta.fix_columns(self.ots_client,[attachment_path,attachment_filetype],True):
                 objectPath = atta.getProperties().get(attachment_path)
                 _filetype = atta.getProperties().get(attachment_filetype)
-                if _filetype in ("doc","xls"):
-                    if len(list_filemd5)==1:
-                        dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_jump,True)
-                        dp.update_row(self.ots_client)
-                        return
-                    else:
-                        continue
+
+                # not supported on windows
+                # if _filetype in ("doc","xls"):
+                #     if len(list_filemd5)==1:
+                #         dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_jump,True)
+                #         dp.update_row(self.ots_client)
+                #         return
+                #     else:
+                #         continue
+
                 localpath = "%s/%s.%s"%(self.download_path,_filemd5,_filetype)
                 localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
                 download_succeed = False
@@ -120,7 +121,7 @@ class Product_Attachment_Processor():
                     download_succeed = False
                 if download_succeed:
                     try:
-                        _html = ""
+
                         if os.path.exists(localhtml):
                             _html = open(localhtml,"r",encoding="utf8").read()
                             _success = True
@@ -128,79 +129,103 @@ class Product_Attachment_Processor():
                             _success = True
                         else:
                             _data_base64 = base64.b64encode(open(localpath,"rb").read())
-                            _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,url="http://192.168.2.102:15011/convert",kwargs={'page_no': '1,-1',"max_bytes":"-1"},timeout=6000)
+                            _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,kwargs={'page_no': '1,-1',"max_bytes":"-1"},timeout=6000)
+
                             if _success:
-                                localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
-                                with open(localhtml,"w",encoding="utf8") as f:
-                                    f.write(_html)
-                        if _success:
-                            if len(_html)>5:
-                                pd = ParseDocument(_html,True)
-
-                                list_text = []
-                                for _product in list_product:
-                                    pd.fix_tree(_product)
-                                    list_data = pd.tree
-                                    _text,_count = extract_product_parameters(list_data,_product)
-                                    if _count>0:
-                                        _find = True
-                                    if _text is not None:
-                                        list_text.append(_text)
-                                pd = ParseDocument(_html,False)
-
-                                list_text = []
-                                for _product in list_product:
-                                    pd.fix_tree(_product)
-                                    list_data = pd.tree
-                                    _text,_count = extract_product_parameters(list_data,_product)
-                                    if _count>0:
-                                        _find = True
-                                    if _text is not None:
-                                        list_text.append(_text)
-                                if len(list_text)>0:
-                                    list_text.sort(key=lambda x:len(re.findall('[::;;]',BeautifulSoup(x,"html5lib").get_text())), reverse=True)
-                                    _text = list_text[0]
-                                    _success = True
-                                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER,_text,True)
-                                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_succeed,True)
-                                    dp.update_row(self.ots_client)
-                                    return
-                            else:
-                                log("product attachment process filemd5 %s has no content"%(_filemd5))
+                                db.set(_key,_html,24*60*60)
+                                # save for debug
+                                # localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
+                                # with open(localhtml,"w",encoding="utf8") as f:
+                                #     f.write(_html)
+
                     except Exception as e:
                         traceback.print_exc()
                     finally:
                         try:
-                            # if os.path.exists(localpath):
-                            #     os.remove(localpath)
+                            if os.path.exists(localpath):
+                                os.remove(localpath)
                             pass
                         except Exception as e:
                             pass
+        return _html
+
+    def process_parameters_handler(self,item,result_queue):
+        attachments = item.get(DOCUMENT_PRODUCT_ATTACHMENTS)
+        product_name = item.get(DOCUMENT_PRODUCT_NAME)
+        product_original_name = item.get(DOCUMENT_PRODUCT_ORIGINAL_NAME)
+        list_product = []
+        if product_original_name is not None:
+            _l = product_original_name.split("_")
+            _l.reverse()
+            list_product.extend(_l)
+        if product_name is not None:
+            list_product.append(product_name)
+        list_product = list(set(list_product))
+        dp = Document_product(item)
+        if attachments is None or attachments=="" or len(list_product)==0:
+            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
+            dp.update_row(self.ots_client)
+            return
+        list_attachment = json.loads(attachments)
+        list_filemd5 = [a.get("filemd5","") for a in list_attachment]
+        _find = False
+        _success = False
+        list_text = []
+        for _filemd5 in list_filemd5:
+            _html = self.get_whole_html(_filemd5)
+            if len(_html)>5:
+
+                pd = ParseDocument(_html,True)
+                for _product in list_product:
+                    pd.fix_tree(_product)
+                    list_data = pd.tree
+                    _text,_count = extract_product_parameters(list_data,_product)
+                    if _count>0:
+                        _find = True
+                    if _text is not None:
+                        list_text.append(_text)
 
-        if not _find:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_not_found,True)
+                pd = ParseDocument(_html,False)
+                for _product in list_product:
+                    pd.fix_tree(_product)
+                    list_data = pd.tree
+                    _text,_count = extract_product_parameters(list_data,_product)
+                    if _count>0:
+                        _find = True
+                    if _text is not None:
+                        list_text.append(_text)
+            else:
+                log("product attachment process filemd5 %s has no content"%(_filemd5))
+
+        if len(list_text)>0:
+            _text = getBestProductText(list_text,'',[])
+            logger.info("extract_parameter_text bid_filemd5s:%s name:%s original_name:%s parameter_text:%s"%(str(list_filemd5),product_name,product_original_name,_text))
+            dp.setValue(DOCUMENT_PRODUCT_PARAMETER,_text,True)
+            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_succeed,True)
             dp.update_row(self.ots_client)
         else:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_failed,True)
-            dp.update_row(self.ots_client)
+            if not _find:
+                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_not_found,True)
+                dp.update_row(self.ots_client)
+            else:
+                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_failed,True)
+                dp.update_row(self.ots_client)
 
     def start_process(self):
-        mt = MultiThreadHandler(self.product_attachment_queue,self.process_parameters_handler,None,3,need_stop=False,restart=True)
+        mt = MultiThreadHandler(self.product_attachment_queue,self.process_parameters_handler,None,2,need_stop=False,restart=True)
         mt.run()
 
     def process_parameters_comsumer(self,):
+        process_count = 2
+        list_process = []
+        for i in range(process_count):
+            p = Process(target=self.start_process)
+            list_process.append(p)
+        for p in list_process:
+            p.start()
+        for p in list_process:
+            p.join()
 
-        # process_count = 2
-        # list_process = []
-        # for i in range(process_count):
-        #     p = Process(target=self.start_process)
-        #     list_process.append(p)
-        # for p in list_process:
-        #     p.start()
-        # for p in list_process:
-        #     p.join()
-
-        self.start_process()
 
     def start_process_parameters(self):
         scheduler = BlockingScheduler()

Datei-Diff unterdrückt, da er zu groß ist
+ 1 - 1
BaseDataMaintenance/maxcompute/documentDumplicate.py


+ 11 - 11
BaseDataMaintenance/model/ots/document.py

@@ -321,11 +321,15 @@ def turn_document_status():
         #     # must_not_queries=[WildcardQuery("DX004354*")]
         # )
         bool_query = BoolQuery(
-            must_queries=[
-                RangeQuery("crtime","2023-08-30 15:00:00","2023-08-30 23:59:59"),
-                NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
-            ],
-            must_not_queries=[WildcardQuery("attachmenttextcon","*")]
+            # must_queries=[
+            #     RangeQuery("crtime","2023-08-30 15:00:00","2023-08-30 23:59:59"),
+            #     NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
+            # ],
+            # must_not_queries=[WildcardQuery("attachmenttextcon","*")],
+            should_queries=[
+                NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","个体工商户")),
+                NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","机械设备")),
+            ]
 
         )
 
@@ -337,9 +341,7 @@ def turn_document_status():
         _count = len(list_data)
         for _data in list_data:
             _document = Document(_data)
-            _attachment = _data.get(document_attachmenttextcon,"")
-            if _attachment=="":
-                task_queue.put(_document)
+            task_queue.put(_document)
         while next_token:
             rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
@@ -349,9 +351,7 @@ def turn_document_status():
             print("%d/%d"%(_count,total_count))
             for _data in list_data:
                 _document = Document(_data)
-                _attachment = _data.get(document_attachmenttextcon,"")
-                if _attachment=="":
-                    task_queue.put(_document)
+                task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.