Quellcode durchsuchen

优化产品配置提取,预估召回率和准确率能达到85%以上,有值率25%左右

luojiehua vor 1 Jahr
Ursprung
Commit
f7ab637cbe

+ 6 - 4
BaseDataMaintenance/maintenance/dataflow.py

@@ -499,7 +499,7 @@ class Dataflow():
             return _split
         return []
 
-    def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
+    def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count,document_tmp_doctitle]):
 
         list_data = []
         if isinstance(_query,list):
@@ -2205,6 +2205,8 @@ class Dataflow_dumplicate(Dataflow):
         else:
             _dict["project_code"] = ""
         _dict["doctitle_refine"] = _extract.get("doctitle_refine","")
+        if _dict["doctitle_refine"]=="":
+            _dict["doctitle_refine"] = _dict.get("doctitle")
         _dict["nlp_enterprise"] = str({"indoctextcon":_extract.get("nlp_enterprise",[]),
                                        "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])})
         _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
@@ -2256,7 +2258,7 @@ class Dataflow_dumplicate(Dataflow):
             return the_group[:_index+1]
         return []
 
-    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
+    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=True):
         document_less = _dict1
         docid_less = _dict1["docid"]
         docchannel_less = document_less["docchannel"]
@@ -3894,7 +3896,7 @@ class Dataflow_dumplicate(Dataflow):
                 singleNum_keys = _rule["singleNum_keys"]
                 contain_keys = _rule["contain_keys"]
                 multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district])
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle])
                 _i += step
 
 
@@ -4173,7 +4175,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(349638765)
+    df_dump.test_dumplicate(359517787)
     # df_dump.test_merge([292315564],[287890754])
     # df_dump.flow_remove_project_tmp()
     print("takes",time.time()-a)

+ 12 - 4
BaseDataMaintenance/maintenance/product/1.py

@@ -1,8 +1,16 @@
-
+#coding:utf8
 
 import re
-from BaseDataMaintenance.maintenance.product.htmlparser import ParseDocument
+from bs4 import BeautifulSoup
 
-pd = ParseDocument("_html",False)
+p = '''
+包名称:包B:电脑恒温电蜡疗仪,全自动红外母乳分析仪,生物反馈治疗仪,磁刺激仪,多参数生物反馈仪、婴幼儿养育照护指导中心综合管理平台、多功能婴儿培养箱供应商名称:济南旭博医疗设备有限公司
+<table border="1"><tbody><tr><td colspan="1">货物名称</td><td colspan="1">品牌</td><td colspan="1">产地</td><td colspan="1">规格要求</td><td colspan="1">单价(元)/优惠率</td><td colspan="1">数量/单位</td></tr><tr><td colspan="1">婴幼儿养育照护指导中心综合管理平台</td><td colspan="1">北京零六</td><td colspan="1">北京/北京零六爱成长健康科技有限公司</td><td colspan="1">爱成长</td><td colspan="1">220000.000000</td><td colspan="1">1套</td></tr><tr><td colspan="1">电脑恒温电蜡疗仪</td><td colspan="1">苏州好博 </td><td colspan="1">苏州/苏州好博医疗器械股份有限公司</td><td colspan="1">HB-LY3</td><td colspan="1">104000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">全自动红外母乳分析仪</td><td colspan="1">泰安康宇</td><td colspan="1">泰安/泰安市康宇医疗器械有限公司</td><td colspan="1">KY-9002</td><td colspan="1">200000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">多功能婴儿培养箱</td><td colspan="1">宁波戴维</td><td colspan="1">宁波/宁波戴维医疗器械股份有限公司</td><td colspan="1">YP-3000</td><td colspan="1">302000.000000</td><td colspan="1">2台</td></tr><tr><td colspan="1">多参数生物反馈仪</td><td colspan="1">南京伟思</td><td colspan="1">南京/南京伟思医疗科技股份有限公司</td><td colspan="1">Infiniti3000C</td><td colspan="1">220000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">生物反馈治疗仪</td><td colspan="1">南京锐诗得</td><td colspan="1">南京/南京锐诗得医疗科技有限公司</td><td colspan="1">RSD RM4</td><td colspan="1">87000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">磁刺激仪</td><td colspan="1">南京伟思</td><td colspan="1">南京/南京伟思医疗科技股份有限公司</td><td colspan="1">Magneuro100HZ</td><td colspan="1">355000.000000</td><td colspan="1">1台</td></tr></tbody></table>
 
-print("==",pd.find_title_by_pattern("H、全自动糖化血红蛋白"))
+'''
+_text = BeautifulSoup(p,"html5lib").get_text()
+print(_text)
+meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
+not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|总价|专家论证意见|评标方法|代理服务费|售后服务|邮政编码|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方|甲方|合同|报价|采购人|技术支持服务"
+print(list(set(re.findall(meter_pattern,_text))))
+print(list(set(re.findall(not_meter_pattern,_text))))

+ 78 - 28
BaseDataMaintenance/maintenance/product/htmlparser.py

@@ -14,8 +14,9 @@ from bs4 import BeautifulSoup
 import copy
 
 end_pattern = "商务要求|评分标准|商务条件|商务条件"
-_param_pattern = "(产品|技术|清单|配置|参数|具体|明细|项目|招标|货物|服务|规格|工作|具体)[及和与]?(指标|配置|条件|要求|参数|需求|规格|名称及要求)|配置清单|(质量|技术).{,10}要求|验收标准|^(参数|功能)$"
-meter_pattern = "\d+([毫μ千]?[升L]|摄氏度|kg|mm|um|cm|mol|Mpa|Hz|℃|W|min)|角度|容积|色彩|帧率|磁场|强度|允差|噪音|材质|频率|阻抗|浓度|范围|误差|精确|温度|可调|设定值|功能|检测|高度|宽度|模式|尺寸|重量|峰值|容量|寿命|稳定性|高温|电源|电压|功率|压力|压强"
+_param_pattern = "(产品|技术|清单|配置|参数|具体|明细|项目|招标|货物|服务|规格|工作|具体)[及和与]?(指标|配置|条件|要求|参数|需求|规格|条款|名称及要求)|配置清单|(质量|技术).{,10}要求|验收标准|^(参数|功能)$"
+meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
+not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|专家论证意见|评标方法|代理服务费|售后服务|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方"
 
 
 def getTrs(tbody):
@@ -696,7 +697,6 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|设备材
                             if re.search("^\d+$",cell_text) is not None:
                                 has_number = True
 
-
                         if cell_i>=len(line):
                             continue
                         cell = line[cell_i]
@@ -707,7 +707,7 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|设备材
 
                     if len(table_products)>0:
                         logger.debug("table products %s"%(str(table_products)))
-                        if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=20:
+                        if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=30:
                             if re.search("招标人|代理人|预算|数量|交货期|品牌|产地","".join(table_products)) is None:
                                 list_table_products.append(table_products)
     _find = False
@@ -715,12 +715,13 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|设备材
         for _p in table_products:
             if is_similar(_product,_p,90):
                 _find = True
-                list_result = list(set([a for a in table_products if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
+                logger.debug("similar table_products %s"%(str(table_products)))
+                list_result = list(set([a for a in table_products if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费|^其他$",a) is None]))
                 break
     if not _find:
         for table_products in list_table_products:
             list_result.extend(table_products)
-        list_result = list(set([a for a in list_result if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
+        list_result = list(set([a for a in list_result if len(a)>1 and len(a)<30 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
     return list_result
 
 
@@ -767,12 +768,20 @@ def get_correct_product(product,products):
 def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
     _text = ""
 
+    end_next = False
     for _child in childs:
 
         child_text = _child.get("text")
 
+
         if child_text.find(_product)>=0:
-            is_begin = True
+            if not is_begin:
+                is_begin = True
+                if not end_next:
+                    if _child["sentence_title"] is not None and isinstance(_child["title_next"],dict) and _child["title_next"]["sentence_title"] is not None:
+                        end_next = True
+                        end_title = _child["title_next"]
+                        logger.debug("end_title %s "%end_title["text"])
 
         logger.debug("%s-%s-%s"%("get_childs_text",child_text[:10],str(is_begin)))
 
@@ -797,31 +806,43 @@ def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
             _text += _child.get("text")+"\r\n"
         childs2 = _child.get("child_title",[])
 
+
         if len(childs2)>0:
             for _child2 in childs2:
                 child_text,is_begin,is_end = get_childs_text([_child2],_product,products,is_begin)
-                if is_begin and is_end:
-                    break
-                else:
-                    if is_begin:
-                        _text += child_text
+                if is_begin:
+                    _text += child_text
+                    if is_end:
+                        break
+
+        if end_next:
+            is_end = True
+
+    #     logger.debug("%s-%s-%s"%("get_childs_text1",_text,str(is_begin)))
+    # logger.debug("%s-%s-%s"%("get_childs_text2",_text,str(is_begin)))
     return _text,is_begin,is_end
 
 def extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result,):
     _data = list_data[_data_i]
     childs = _data.get("child_title",[])
     if len(childs)>0:
-        child_text,_,_ = get_childs_text([parent_title],_product,products)
-        logger.info("extract_parameters_by_tree child_text:%s"%child_text)
+        child_text,_,_ = get_childs_text([_data],_product,products)
         if len(child_text)>0:
+            logger.info("extract_type by_tree child_text:%s"%child_text)
             list_result.append(child_text)
     if parent_title is not None:
+        child_text,_,_ = get_childs_text([parent_title],_product,products)
+        if len(child_text)>0:
+            logger.info("extract_type by_tree child_text:%s"%child_text)
+            list_result.append(child_text)
+
         childs = parent_title.get("child_title",[])
         if len(childs)>0:
 
             range_data = get_range_data_by_childs(list_data[_data_i:],childs)
             p_text = ""
             _find = False
+            end_id = id(_data["title_next"]) if isinstance(_data["sentence_title"],dict) and _data["title_next"] is not None and _data["title_next"]["sentence_title"] is not None else None
             for pdata in range_data:
                 ptext = pdata["text"]
                 for p in products:
@@ -832,8 +853,11 @@ def extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,
                     _find = True
                 if _find:
                     break
+                if id(pdata)==end_id:
+                    break
                 p_text += ptext+"\r\n"
             if len(p_text)>0:
+                logger.debug("extract_type by parent range_text:%s"%p_text)
                 list_result.append(p_text)
                 return True
     return False
@@ -861,8 +885,8 @@ def get_table_pieces(_text,_product,products,list_result,_find):
             if _find:
                 list_trs.append(tr)
         if len(list_trs)>0:
-            logger.debug("extract_type table slices")
             table_html = "<table>%s</table>"%("\r\n".join([str(a) for a in list_trs]))
+            logger.debug("extract_type table slices %s"%(table_html))
             list_result.append(table_html)
 
 def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result):
@@ -887,8 +911,8 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
         if re.search(_param_pattern,text_line_first) is not None and text_line_first.find(_product)>=0:
             _flag = True
         if _flag:
-            logger.debug("extract_type add all table %s"%_text)
             if len(products)==0:
+                logger.debug("extract_type whole table by param and product %s"%(_text))
                 list_result.append(_text)
             else:
                 for p in products:
@@ -911,15 +935,26 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
             for line in list_table:
                 for cell in line:
                     cell_text = cell[0]
-                    if len(cell_text)>50 and len(re.findall("\d+",cell_text))>10 and cell_text.find(_product)>=0:
-                        list_result.append(cell_text)
+                    if len(cell_text)>50 and len(re.findall(meter_pattern,cell_text))>5 and cell_text.find(_product)>=0:
+                        _f = True
+                        for cell in line:
+                            if not _f:
+                                break
+                            cell_text = cell[0]
+                            for p in products:
+                                if cell_text.find(p)>=0 and p!=_product:
+                                    _f = False
+                                    break
+                        if _f:
+                            logger.debug("extract_type param column %s"%(cell_text))
+                            list_result.append(cell_text)
                     if len(cell_text)<len(_product)*10 and str(cell_text).find(_product)>=0:
                         for _index in list_head_index:
                             if _index>=len(line):
                                 continue
                             _cell = line[_index]
                             if len(cell[0])>0:
-                                logger.info("%s-%s"%("extract_type add on table",_cell[0]))
+                                logger.info("%s-%s"%("extract_type add on table text:",_cell[0]))
                                 list_result.append(_cell[0])
         if not _flag and (re.search(_param_pattern,_text) is not None or (parent_title is not None and re.search(_param_pattern,parent_title["text"]) is not None)) and _text.find(_product)>=0:
             get_table_pieces(_text,_product,products,list_result,False)
@@ -984,14 +1019,14 @@ def extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,lis
     if re.search(_param_pattern,_text) is not None and len(_text)<50:
         childs = _data["child_title"]
         if len(childs)>0:
-            logger.debug("extract_type sentence %s"%("re.search(_param_pattern,_text) is not None and len(_text)<50:"))
             extract_text,_,_ = get_childs_text([_data],_product,products)
             if len(extract_text)>0:
+                logger.debug("extract_type param-product %s"%(extract_text))
                 list_result.append(extract_text)
             elif is_project:
-                logger.debug("extract_type sentence is_project")
                 extract_text,_,_ = get_childs_text([_data],_product,products,is_begin=True)
                 if len(extract_text)>0 and re.search(meter_pattern,extract_text) is not None:
+                    logger.debug("extract_type sentence is_project param-product is product %s"%(extract_text))
                     list_result.append(extract_text)
 
 def getBestProductText(list_result,_product,products):
@@ -1006,7 +1041,7 @@ def getBestProductText(list_result,_product,products):
         _result = list_result[i]
         _check = True
         _result_text = BeautifulSoup(_result,"html5lib").get_text()
-        _search = re.search("项目编号[::]|项目名称[::]|联合体投标",_result)
+        _search = re.search("项目编号[::]|项目名称[::]|联合体投标|开户银行",_result)
         if _search is not None:
             logger.debug("result%d error illegal text %s"%(i,str(_search)))
             _check = False
@@ -1015,7 +1050,7 @@ def getBestProductText(list_result,_product,products):
                 if _result_text.find(p)>0 and not (is_similar(_product,p,80) or p.find(_product)>=0 or _product.find(p)>=0):
                     logger.debug("result%d error product scoss %s"%(i,p))
                     _check = False
-        if len(_result_text)<50:
+        if len(_result_text)<100:
             if re.search(meter_pattern,_result_text) is None:
                 logger.debug("result%d error text min count"%(i))
                 _check = False
@@ -1027,6 +1062,19 @@ def getBestProductText(list_result,_product,products):
                 logger.debug("result%d error text max count less meter"%(i))
                 _check = False
 
+        list_find = list(set(re.findall(meter_pattern,_result_text)))
+
+        not_list_find = list(set(re.findall(not_meter_pattern,_result_text)))
+        _count = len(list_find)-len(not_list_find)
+        has_num = False
+        for _find in list_find:
+            if re.search('[0-9a-zA-Z]',_find) is not None:
+                has_num = True
+                break
+        if not(_count>=2 and has_num or _count>=5):
+            logger.debug("result%d error match not enough"%(i))
+            _check = False
+
         if _check:
             return _result
 
@@ -1052,7 +1100,7 @@ def extract_product_parameters(list_data,_product):
         if _type=="sentence":
             if _text.find(_product)>=0:
                 _find_count += 1
-                if re.search("项目名称|采购项目",_text) is not None:
+                if re.search("项目名称|采购项目",_text) is not None and re.search("等",_text) is not None:
                     is_project = True
             extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project)
 
@@ -1061,13 +1109,14 @@ def extract_product_parameters(list_data,_product):
                 _find_count += 1
             extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result)
 
-    return getBestProductText(list_result,_product,products),_find_count
+    _text = getBestProductText(list_result,_product,products)
+    return _text,_find_count
 
 
 if __name__ == '__main__':
 
-    filepath = "download/fa85b009fad13bd5e48ae1a51d2e3175.html"
-    _product = "自动体外除颤器(AED)等"
+    filepath = "download/4597dcc128bfabc7584d10590ae50656.html"
+    _product = "彩色多普勒超声诊断仪"
 
     _html = open(filepath, "r", encoding="utf8").read()
 
@@ -1079,5 +1128,6 @@ if __name__ == '__main__':
 
     _text,_count = extract_product_parameters(list_data,_product)
     logger.info("find count:%d"%(_count))
-    logger.info("extract_text %s"%_text)
+    logger.info("extract_parameter_text::%s"%(_text))
+
 

+ 113 - 88
BaseDataMaintenance/maintenance/product/product_attachment.py

@@ -11,6 +11,7 @@ from BaseDataMaintenance.model.ots.attachment import *
 from BaseDataMaintenance.common.Utils import *
 from BaseDataMaintenance.common.ossUtils import *
 from BaseDataMaintenance.maintenance.product.htmlparser import *
+from BaseDataMaintenance.maintenance.product.productUtils import pool_product
 import oss2
 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
 
@@ -21,6 +22,8 @@ parameter_status_process_failed = 2
 parameter_status_process_jump = 3
 parameter_status_not_found = 4
 
+import redis
+
 class Product_Attachment_Processor():
 
     def __init__(self,):
@@ -41,6 +44,7 @@ class Product_Attachment_Processor():
         self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
         self.current_path = os.path.dirname(__file__)
         self.download_path = "%s/%s"%(self.current_path,"download")
+        self.test_url="http://192.168.2.102:15011/convert"
 
     def process_parameters_producer(self,):
 
@@ -52,7 +56,7 @@ class Product_Attachment_Processor():
         list_id = []
         rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
                                                                             SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
-                                                                            ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
+                                                                            ColumnsToGet([DOCUMENT_PRODUCT_ATTACHMENTS,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
 
         list_data = getRow_ots(rows)
         for data in list_data:
@@ -66,7 +70,7 @@ class Product_Attachment_Processor():
                 break
             rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
                                                                                 SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                                ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
+                                                                                ColumnsToGet([DOCUMENT_PRODUCT_ATTACHMENTS,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
             list_data = getRow_ots(rows)
             for data in list_data:
                 _id = data.get(DOCUMENT_PRODUCT_ID)
@@ -76,38 +80,35 @@ class Product_Attachment_Processor():
                 list_id.append(_id)
         self.set_product_attachment =  set(list_id)
 
-    def process_parameters_handler(self,item,result_queue):
-        bid_filemd5s = item.get(DOCUMENT_PRODUCT_BID_FILEMD5S)
-        product_name = item.get(DOCUMENT_PRODUCT_NAME)
-        product_original_name = item.get(DOCUMENT_PRODUCT_ORIGINAL_NAME)
-        list_product = []
-        if product_name is not None:
-            list_product.append(product_name)
-        if product_original_name is not None:
-            list_product.extend(product_original_name.split("_"))
-        list_product = list(set(list_product))
-        dp = Document_product(item)
-        if bid_filemd5s is None or bid_filemd5s=="" or len(list_product)==0:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
-            dp.update_row(self.ots_client)
-            return
-        list_filemd5 = bid_filemd5s.split(",")
-        _find = False
-        _success = False
-        for _filemd5 in list_filemd5:
-            if _find:
-                break
-            atta = attachment({attachment_filemd5:_filemd5})
+    def get_whole_html(self,_filemd5):
+        atta = attachment({attachment_filemd5:_filemd5})
+        _html = ""
+
+        db = redis.Redis(connection_pool=pool_product)
+        _key = "filemd5:%s"%(_filemd5)
+
+        _cache_html = None
+        try:
+            _cache_html = db.get(_key)
+        except Exception as e:
+            logger.info("get redis cache html error")
+        
+        if _cache_html is not None:
+            _html = _cache_html
+        else:
             if atta.fix_columns(self.ots_client,[attachment_path,attachment_filetype],True):
                 objectPath = atta.getProperties().get(attachment_path)
                 _filetype = atta.getProperties().get(attachment_filetype)
-                if _filetype in ("doc","xls"):
-                    if len(list_filemd5)==1:
-                        dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_jump,True)
-                        dp.update_row(self.ots_client)
-                        return
-                    else:
-                        continue
+
+                # not supported on windows
+                # if _filetype in ("doc","xls"):
+                #     if len(list_filemd5)==1:
+                #         dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_jump,True)
+                #         dp.update_row(self.ots_client)
+                #         return
+                #     else:
+                #         continue
+
                 localpath = "%s/%s.%s"%(self.download_path,_filemd5,_filetype)
                 localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
                 download_succeed = False
@@ -120,7 +121,7 @@ class Product_Attachment_Processor():
                     download_succeed = False
                 if download_succeed:
                     try:
-                        _html = ""
+
                         if os.path.exists(localhtml):
                             _html = open(localhtml,"r",encoding="utf8").read()
                             _success = True
@@ -128,79 +129,103 @@ class Product_Attachment_Processor():
                             _success = True
                         else:
                             _data_base64 = base64.b64encode(open(localpath,"rb").read())
-                            _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,url="http://192.168.2.102:15011/convert",kwargs={'page_no': '1,-1',"max_bytes":"-1"},timeout=6000)
+                            _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,kwargs={'page_no': '1,-1',"max_bytes":"-1"},timeout=6000)
+
                             if _success:
-                                localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
-                                with open(localhtml,"w",encoding="utf8") as f:
-                                    f.write(_html)
-                        if _success:
-                            if len(_html)>5:
-                                pd = ParseDocument(_html,True)
-
-                                list_text = []
-                                for _product in list_product:
-                                    pd.fix_tree(_product)
-                                    list_data = pd.tree
-                                    _text,_count = extract_product_parameters(list_data,_product)
-                                    if _count>0:
-                                        _find = True
-                                    if _text is not None:
-                                        list_text.append(_text)
-                                pd = ParseDocument(_html,False)
-
-                                list_text = []
-                                for _product in list_product:
-                                    pd.fix_tree(_product)
-                                    list_data = pd.tree
-                                    _text,_count = extract_product_parameters(list_data,_product)
-                                    if _count>0:
-                                        _find = True
-                                    if _text is not None:
-                                        list_text.append(_text)
-                                if len(list_text)>0:
-                                    list_text.sort(key=lambda x:len(re.findall('[::;;]',BeautifulSoup(x,"html5lib").get_text())), reverse=True)
-                                    _text = list_text[0]
-                                    _success = True
-                                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER,_text,True)
-                                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_succeed,True)
-                                    dp.update_row(self.ots_client)
-                                    return
-                            else:
-                                log("product attachment process filemd5 %s has no content"%(_filemd5))
+                                db.set(_key,_html,24*60*60)
+                                # save for debug
+                                # localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
+                                # with open(localhtml,"w",encoding="utf8") as f:
+                                #     f.write(_html)
+
                     except Exception as e:
                         traceback.print_exc()
                     finally:
                         try:
-                            # if os.path.exists(localpath):
-                            #     os.remove(localpath)
+                            if os.path.exists(localpath):
+                                os.remove(localpath)
                             pass
                         except Exception as e:
                             pass
+        return _html
+
+    def process_parameters_handler(self,item,result_queue):
+        attachments = item.get(DOCUMENT_PRODUCT_ATTACHMENTS)
+        product_name = item.get(DOCUMENT_PRODUCT_NAME)
+        product_original_name = item.get(DOCUMENT_PRODUCT_ORIGINAL_NAME)
+        list_product = []
+        if product_original_name is not None:
+            _l = product_original_name.split("_")
+            _l.reverse()
+            list_product.extend(_l)
+        if product_name is not None:
+            list_product.append(product_name)
+        list_product = list(set(list_product))
+        dp = Document_product(item)
+        if attachments is None or attachments=="" or len(list_product)==0:
+            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
+            dp.update_row(self.ots_client)
+            return
+        list_attachment = json.loads(attachments)
+        list_filemd5 = [a.get("filemd5","") for a in list_attachment]
+        _find = False
+        _success = False
+        list_text = []
+        for _filemd5 in list_filemd5:
+            _html = self.get_whole_html(_filemd5)
+            if len(_html)>5:
+
+                pd = ParseDocument(_html,True)
+                for _product in list_product:
+                    pd.fix_tree(_product)
+                    list_data = pd.tree
+                    _text,_count = extract_product_parameters(list_data,_product)
+                    if _count>0:
+                        _find = True
+                    if _text is not None:
+                        list_text.append(_text)
 
-        if not _find:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_not_found,True)
+                pd = ParseDocument(_html,False)
+                for _product in list_product:
+                    pd.fix_tree(_product)
+                    list_data = pd.tree
+                    _text,_count = extract_product_parameters(list_data,_product)
+                    if _count>0:
+                        _find = True
+                    if _text is not None:
+                        list_text.append(_text)
+            else:
+                log("product attachment process filemd5 %s has no content"%(_filemd5))
+
+        if len(list_text)>0:
+            _text = getBestProductText(list_text,'',[])
+            logger.info("extract_parameter_text bid_filemd5s:%s name:%s original_name:%s parameter_text:%s"%(str(list_filemd5),product_name,product_original_name,_text))
+            dp.setValue(DOCUMENT_PRODUCT_PARAMETER,_text,True)
+            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_succeed,True)
             dp.update_row(self.ots_client)
         else:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_failed,True)
-            dp.update_row(self.ots_client)
+            if not _find:
+                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_not_found,True)
+                dp.update_row(self.ots_client)
+            else:
+                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_failed,True)
+                dp.update_row(self.ots_client)
 
     def start_process(self):
-        mt = MultiThreadHandler(self.product_attachment_queue,self.process_parameters_handler,None,3,need_stop=False,restart=True)
+        mt = MultiThreadHandler(self.product_attachment_queue,self.process_parameters_handler,None,2,need_stop=False,restart=True)
         mt.run()
 
     def process_parameters_comsumer(self,):
+        process_count = 2
+        list_process = []
+        for i in range(process_count):
+            p = Process(target=self.start_process)
+            list_process.append(p)
+        for p in list_process:
+            p.start()
+        for p in list_process:
+            p.join()
 
-        # process_count = 2
-        # list_process = []
-        # for i in range(process_count):
-        #     p = Process(target=self.start_process)
-        #     list_process.append(p)
-        # for p in list_process:
-        #     p.start()
-        # for p in list_process:
-        #     p.join()
-
-        self.start_process()
 
     def start_process_parameters(self):
         scheduler = BlockingScheduler()

Datei-Diff unterdrückt, da er zu groß ist
+ 1 - 1
BaseDataMaintenance/maxcompute/documentDumplicate.py


+ 11 - 11
BaseDataMaintenance/model/ots/document.py

@@ -321,11 +321,15 @@ def turn_document_status():
         #     # must_not_queries=[WildcardQuery("DX004354*")]
         # )
         bool_query = BoolQuery(
-            must_queries=[
-                RangeQuery("crtime","2023-08-30 15:00:00","2023-08-30 23:59:59"),
-                NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
-            ],
-            must_not_queries=[WildcardQuery("attachmenttextcon","*")]
+            # must_queries=[
+            #     RangeQuery("crtime","2023-08-30 15:00:00","2023-08-30 23:59:59"),
+            #     NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
+            # ],
+            # must_not_queries=[WildcardQuery("attachmenttextcon","*")],
+            should_queries=[
+                NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","个体工商户")),
+                NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","机械设备")),
+            ]
 
         )
 
@@ -337,9 +341,7 @@ def turn_document_status():
         _count = len(list_data)
         for _data in list_data:
             _document = Document(_data)
-            _attachment = _data.get(document_attachmenttextcon,"")
-            if _attachment=="":
-                task_queue.put(_document)
+            task_queue.put(_document)
         while next_token:
             rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
@@ -349,9 +351,7 @@ def turn_document_status():
             print("%d/%d"%(_count,total_count))
             for _data in list_data:
                 _document = Document(_data)
-                _attachment = _data.get(document_attachmenttextcon,"")
-                if _attachment=="":
-                    task_queue.put(_document)
+                task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.