
Merge remote-tracking branch 'origin/master'

znj, 1 year ago
parent commit 644768dabc

+ 6 - 4
BaseDataMaintenance/maintenance/dataflow.py

@@ -499,7 +499,7 @@ class Dataflow():
             return _split
         return []
 
-    def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
+    def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count,document_tmp_doctitle]):
 
         list_data = []
         if isinstance(_query,list):
@@ -2205,6 +2205,8 @@ class Dataflow_dumplicate(Dataflow):
         else:
             _dict["project_code"] = ""
         _dict["doctitle_refine"] = _extract.get("doctitle_refine","")
+        if _dict["doctitle_refine"]=="":
+            _dict["doctitle_refine"] = _dict.get("doctitle")
         _dict["nlp_enterprise"] = str({"indoctextcon":_extract.get("nlp_enterprise",[]),
                                        "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])})
         _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
@@ -2256,7 +2258,7 @@ class Dataflow_dumplicate(Dataflow):
             return the_group[:_index+1]
         return []
 
-    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
+    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=True):
         document_less = _dict1
         docid_less = _dict1["docid"]
         docchannel_less = document_less["docchannel"]
@@ -3894,7 +3896,7 @@ class Dataflow_dumplicate(Dataflow):
                 singleNum_keys = _rule["singleNum_keys"]
                 contain_keys = _rule["contain_keys"]
                 multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district])
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle])
                 _i += step
 
 
@@ -4173,7 +4175,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(349638765)
+    df_dump.test_dumplicate(339737931)
     # df_dump.test_merge([292315564],[287890754])
     # df_dump.flow_remove_project_tmp()
     print("takes",time.time()-a)

+ 37 - 25
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -109,7 +109,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                                       "html":_html})
                 else:
                    #already has a process_time, so skip
-                    if len(str(_attach.getProperties().get(attachment_process_time,"")))>10 and _attach.getProperties().get(attachment_status)!=ATTACHMENT_INIT:
+                    if len(str(_attach.getProperties().get(attachment_process_time,"")))>10 and _attach.getProperties().get(attachment_status)!=ATTACHMENT_INIT and not (_attach.getProperties().get(attachment_status)>=ATTACHMENT_MC_FAILED_FROM and _attach.getProperties().get(attachment_status)<=ATTACHMENT_MC_FAILED_TO):
                         log("%s has process_time jump"%(_filemd5))
                         _html = _attach.getProperties().get(attachment_attachmenthtml,"")
                         if _html is None:
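The widened condition sends attachments whose status sits in the MC_FAILED range back through recognition instead of skipping them once a process_time exists. A hedged restatement of the skip predicate, assuming ATTACHMENT_INIT and the two ATTACHMENT_MC_FAILED_* bounds are integer status constants from the attachment model:

    def should_skip(_attach):
        #skip only attachments that were already processed and did not fail
        props = _attach.getProperties()
        status = props.get(attachment_status)
        has_process_time = len(str(props.get(attachment_process_time, ""))) > 10
        failed_earlier = ATTACHMENT_MC_FAILED_FROM <= status <= ATTACHMENT_MC_FAILED_TO
        return has_process_time and status != ATTACHMENT_INIT and not failed_earlier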
@@ -161,9 +161,11 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
 
             _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
             dhtml.setValue(document_tmp_dochtmlcon,_dochtmlcon,True)
+            dhtml.delete_bidi_a()
             dtmp = Document_tmp(item)
 
 
+            start_time = time.time()
             #call the recognition interface
             _succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
 
@@ -208,6 +210,9 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 ackMsg(conn,message_id)
             log("document:%d get attachments with result:%s %s retry_times:%d"%(item.get("docid"),str(_succeed),str(_to_ack),_retry_times))
         except Exception as e:
+            traceback.print_exc()
+            if time.time()-start_time<10:
+                item["retry_times"] -= 1
             if send_msg_toacmq(self.pool_mq,json.dumps(item,cls=MyEncoder,ensure_ascii=False),self.mq_attachment):
                 ackMsg(conn,message_id)
 
@@ -1023,6 +1028,7 @@ class Dataflow_init(Dataflow):
             self.get_count = 1000
             self.count = self.get_count
             self.begin_docid = None
+            self.mq_init = "/queue/dataflow_init"
             self.mq_attachment = "/queue/dataflow_attachment"
             self.mq_extract = "/queue/dataflow_extract"
             self.pool_mq1 = ConnectorPool(1,4,getConnect_activateMQ)
@@ -1043,32 +1049,38 @@ class Dataflow_init(Dataflow):
             return next_docid
 
         def on_message(self, headers):
-            next_docid = int(self.getNextDocid())
-            partitionkey = int(next_docid%500+1)
-            message_id = headers.headers["message-id"]
-            body = json.loads(headers.body)
-            body[document_tmp_partitionkey] = partitionkey
-            body[document_tmp_docid] = next_docid
-            if body.get(document_original_docchannel) is None:
-                body[document_original_docchannel] = body.get(document_docchannel)
-            page_attachments = body.get(document_tmp_attachment_path,"[]")
-            _uuid = body.get(document_tmp_uuid,"")
-            if page_attachments!="[]":
-                status = random.randint(1,10)
-                body[document_tmp_status] = status
-                if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_attachment):
-                    log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
-                    ackMsg(self.conn,message_id)
+            #parse before the try so the except handler can requeue `body` safely
+            message_id = headers.headers["message-id"]
+            body = json.loads(headers.body)
+            try:
+                next_docid = int(self.getNextDocid())
+                partitionkey = int(next_docid%500+1)
+                body[document_tmp_partitionkey] = partitionkey
+                body[document_tmp_docid] = next_docid
+                if body.get(document_original_docchannel) is None:
+                    body[document_original_docchannel] = body.get(document_docchannel)
+                page_attachments = body.get(document_tmp_attachment_path,"[]")
+                _uuid = body.get(document_tmp_uuid,"")
+                if page_attachments!="[]":
+                    status = random.randint(1,10)
+                    body[document_tmp_status] = status
+                    if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_attachment):
+                        log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
+                        ackMsg(self.conn,message_id)
+                    else:
+                        log("send_msg_error on init listener")
                 else:
-                    log("send_msg_error on init listener")
-            else:
-                status = random.randint(11,50)
-                body[document_tmp_status] = status
-                if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_extract):
-                    log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
+                    status = random.randint(11,50)
+                    body[document_tmp_status] = status
+                    if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_extract):
+                        log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
+                        ackMsg(self.conn,message_id)
+                    else:
+                        log("send_msg_error on init listener")
+            except Exception as e:
+                traceback.print_exc()
+                if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_init):
+                    log("init error")
                     ackMsg(self.conn,message_id)
-                else:
-                    log("send_msg_error on init listener")
 
         def __del__(self):
             self.conn.disconnect()
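The listener rewrite wraps the whole handler in try/except: on any failure the parsed body is pushed onto the new /queue/dataflow_init queue and the message is acked only when that requeue succeeds, so a broken message is retried rather than lost. A minimal sketch of the pattern, with process() as a hypothetical stand-in for the docid/partitionkey/status assignment above (ackMsg, send_msg_toacmq and MyEncoder are the repo's own helpers):

    import json, traceback

    def on_message_safe(conn, headers, pool, mq_init="/queue/dataflow_init"):
        message_id = headers.headers["message-id"]
        body = json.loads(headers.body)      #parse outside the try: the except path needs body
        try:
            process(body)                    #hypothetical stand-in for the real handler
            ackMsg(conn, message_id)
        except Exception:
            traceback.print_exc()
            if send_msg_toacmq(pool, json.dumps(body, cls=MyEncoder), mq_init):
                ackMsg(conn, message_id)     #safe to ack: a copy is back on the init queue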

+ 14 - 13
BaseDataMaintenance/maintenance/product/1.py

@@ -1,15 +1,16 @@
-
+#coding:utf8
 
 import re
-pattern="(^|★|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册部\.::]))|" \
-        "([\s★\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
-        "([\s★\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
-        "([\s★\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
-        "([\s★\*]*)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?(?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>)))|" \
-        "([\s★\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))" \
-        ""
-print(re.search(pattern,"(一)4K内窥镜荧光摄像系统主机").groupdict())
+from bs4 import BeautifulSoup
+
+p = '''
+包名称:包B:电脑恒温电蜡疗仪,全自动红外母乳分析仪,生物反馈治疗仪,磁刺激仪,多参数生物反馈仪、婴幼儿养育照护指导中心综合管理平台、多功能婴儿培养箱供应商名称:济南旭博医疗设备有限公司
+<table border="1"><tbody><tr><td colspan="1">货物名称</td><td colspan="1">品牌</td><td colspan="1">产地</td><td colspan="1">规格要求</td><td colspan="1">单价(元)/优惠率</td><td colspan="1">数量/单位</td></tr><tr><td colspan="1">婴幼儿养育照护指导中心综合管理平台</td><td colspan="1">北京零六</td><td colspan="1">北京/北京零六爱成长健康科技有限公司</td><td colspan="1">爱成长</td><td colspan="1">220000.000000</td><td colspan="1">1套</td></tr><tr><td colspan="1">电脑恒温电蜡疗仪</td><td colspan="1">苏州好博 </td><td colspan="1">苏州/苏州好博医疗器械股份有限公司</td><td colspan="1">HB-LY3</td><td colspan="1">104000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">全自动红外母乳分析仪</td><td colspan="1">泰安康宇</td><td colspan="1">泰安/泰安市康宇医疗器械有限公司</td><td colspan="1">KY-9002</td><td colspan="1">200000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">多功能婴儿培养箱</td><td colspan="1">宁波戴维</td><td colspan="1">宁波/宁波戴维医疗器械股份有限公司</td><td colspan="1">YP-3000</td><td colspan="1">302000.000000</td><td colspan="1">2台</td></tr><tr><td colspan="1">多参数生物反馈仪</td><td colspan="1">南京伟思</td><td colspan="1">南京/南京伟思医疗科技股份有限公司</td><td colspan="1">Infiniti3000C</td><td colspan="1">220000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">生物反馈治疗仪</td><td colspan="1">南京锐诗得</td><td colspan="1">南京/南京锐诗得医疗科技有限公司</td><td colspan="1">RSD RM4</td><td colspan="1">87000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">磁刺激仪</td><td colspan="1">南京伟思</td><td colspan="1">南京/南京伟思医疗科技股份有限公司</td><td colspan="1">Magneuro100HZ</td><td colspan="1">355000.000000</td><td colspan="1">1台</td></tr></tbody></table>
+
+'''
+_text = BeautifulSoup(p,"html5lib").get_text()
+print(_text)
+meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
+not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|总价|专家论证意见|评标方法|代理服务费|售后服务|邮政编码|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方|甲方|合同|报价|采购人|技术支持服务"
+print(list(set(re.findall(meter_pattern,_text))))
+print(list(set(re.findall(not_meter_pattern,_text))))
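The rewritten 1.py is a scratch test for the meter_pattern/not_meter_pattern regexes shared with htmlparser.py: it flattens a real award-notice table to text and prints which parameter-like and commercial tokens fire. The same check on a one-line input, using a trimmed subset of the pattern for readability:

    import re

    meter_subset = r"[><≤≥±]\d+|功率|规格|尺寸"   #trimmed from the full meter_pattern above
    text = "电脑恒温电蜡疗仪,规格HB-LY3,功率≥200W"
    print(set(re.findall(meter_subset, text)))    #e.g. {'规格', '功率', '≥200'}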

+ 313 - 147
BaseDataMaintenance/maintenance/product/htmlparser.py

@@ -2,22 +2,22 @@
 
 import re
 
-from BaseDataMaintenance.maintenance.product.productUtils import *
+from BaseDataMaintenance.maintenance.product.productUtils import is_similar
 import logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-
-
+logger.setLevel(logging.INFO)
 
 
 from bs4 import BeautifulSoup
 import copy
 
 end_pattern = "商务要求|评分标准|商务条件|商务条件"
-_param_pattern = "(产品|技术|清单[及和]?|配置|参数|具体|明细[及和]?|项目|货物|服务)(指标|配置|要求|参数|需求|规格)|配置清单|(质量|技术).{,10}要求|验收标准|^参数$"
-meter_pattern = "角度|容积|色彩|帧率|磁场|强度|允差|噪音|材质|频率|阻抗|浓度|范围|误差|精确|温度|可调|设定值|功能|检测|高度|宽度|模式|尺寸|重量|峰值|容量|寿命|稳定性|高温|电源|电压|功率|压力|压强"
+_param_pattern = "(产品|技术|清单|配置|参数|具体|明细|项目|招标|货物|服务|规格|工作|具体)[及和与]?(指标|配置|条件|要求|参数|需求|规格|条款|名称及要求)|配置清单|(质量|技术).{,10}要求|验收标准|^(参数|功能)$"
+meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
+not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|专家论证意见|评标方法|代理服务费|售后服务|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方"
+
 
 def getTrs(tbody):
     #collect all tr elements
@@ -128,7 +128,7 @@ class ParseDocument():
         _body = self.soup.find("body")
         if _body is not None:
             self.soup = _body
-        self.list_obj = self.soup.find_all(recursive=False)
+        self.list_obj = self.get_soup_objs(self.soup)
 
         # for obj in self.list_obj:
         #     print("obj",obj.get_text()[:20])
@@ -140,6 +140,18 @@ class ParseDocument():
         # if self.parseTree:
         #     self.parseTree.printParseTree()
 
+    def get_soup_objs(self,soup,list_obj=None):
+        if list_obj is None:
+            list_obj = []
+        childs = soup.find_all(recursive=False)
+        for _obj in childs:
+            childs1 = _obj.find_all(recursive=False)
+            if len(childs1)==0 or len(_obj.get_text())<40 or _obj.name=="table":
+                list_obj.append(_obj)
+            else:
+                self.get_soup_objs(_obj,list_obj)
+        return list_obj
+
     def fix_tree(self,_product):
         products = extract_products(self.tree,_product)
         if len(products)>0:
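get_soup_objs replaces the former flat find_all(recursive=False) with a depth-first flattening: a node is kept whole when it is a leaf, carries under 40 characters of text, or is a table, and longer containers are split into their children. A minimal free-function sketch of the same traversal on bs4 tags:

    from bs4 import BeautifulSoup

    def flatten(node, out):
        #keep leaves, short blocks and tables intact; descend into long containers
        for child in node.find_all(recursive=False):
            if not child.find_all(recursive=False) or len(child.get_text()) < 40 or child.name == "table":
                out.append(child)
            else:
                flatten(child, out)
        return out

    body = BeautifulSoup("<div><p>%s</p><p>%s</p></div>" % ("甲"*50, "乙"*50), "html5lib").body
    print([t.name for t in flatten(body, [])])   #['p', 'p']: the long wrapper div is split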
@@ -148,12 +160,16 @@ class ParseDocument():
     def print_tree(self,tree,append=""):
         if append=="":
             self.set_tree_id = set()
+
+            # for t in tree:
+            #     logger.debug("%s text:%s title:%s title_text:%s before:%s after%s product:%s"%("==>",t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"]))
+
         for t in tree:
             _id = id(t)
             if _id in self.set_tree_id:
                 continue
             self.set_tree_id.add(_id)
-            logger.debug("%s %s %s %s %s"%(append,t["text"][:50],t["sentence_title"],t["title_before"],t["title_after"]))
+            logger.debug("%s text:%s title:%s title_text:%s before:%s after%s product:%s"%(append,t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"]))
             childs = t["child_title"]
             self.print_tree(childs,append=append+"-|")
 
@@ -162,18 +178,18 @@ class ParseDocument():
             return True
         return False
 
-    def find_title_by_pattern(self,_text,_pattern="(^|★|▲|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册包标部\.::]))|" \
-                                             "([\s★▲\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
-                                             "([\s★▲\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
+    def find_title_by_pattern(self,_text,_pattern="(^|★|▲|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册包标部.::、、]+))|" \
+                                             "([\s★▲\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>[、章册包标部.::、、]+))|" \
+                                             "([\s★▲\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]+))|" \
                                              "([\s★▲\*]*)(?P<title_5>(?P<title_5_index_0_0>^)(?P<title_5_index_1_1>[一二三四五六七八九十]+)(?P<title_5_index_2_0>)[^一二三四五六七八九十节章册部\.::、、])|" \
                                              "([\s★▲\*]*)(?P<title_12>(?P<title_12_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_12_index_1_1>\d{1,2})(?P<title_12_index_2_0>[\..、\s\-]?))|"\
                                              "([\s★▲\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
                                              "([\s★▲\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
-                                             "([\s★▲\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..包标::、\s\-]?))|" \
-                                             "([\s★▲\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-包标]?))|" \
-                                             "([\s★▲\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))包标]))|" \
-                                             "([\s★▲\*]*)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>[))包标]))|" \
-                                             "([\s★▲\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))" \
+                                             "([\s★▲\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..包标::、\s\-]*))|" \
+                                             "(^[\s★▲\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-包标]*))|" \
+                                             "([\s★▲\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))包标\..::、]+))|" \
+                                             "([\s★▲\*]*)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>[))包标\..::、]+))|" \
+                                             "([\s★▲\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))"
                               ):
         _se = re.search(_pattern,_text)
         groups = []
@@ -324,6 +340,7 @@ class ParseDocument():
             max_length = max(list_length)
         else:
             max_length = 40
+        max_length = min(max_length,40)
 
         logger.debug("%s:%d"%("max_length",max_length))
 
@@ -335,6 +352,9 @@ class ParseDocument():
         dict_before,illegal_sentence = self.count_title_before(list_obj)
         for obj_i in range(len(list_obj)):
             obj = list_obj[obj_i]
+
+            # logger.debug("==obj %s"%obj.text[:20])
+
             _type = "sentence"
             _text = standard_product(obj.text)
             if obj.name=="table":
@@ -355,36 +375,62 @@ class ParseDocument():
             list_table = None
             block = False
 
+            has_product = False
+
             if _type=="sentence":
                 if _text in illegal_sentence:
                     continue
 
-                _fix = False
-                for p in products:
-                    if re.sub("^(\d[.、]?)+","",_text.strip())==p:
-                        title_before = "=产品"
-                        sentence_title = "title_0"
-                        sentence_title_text = p
-                        title_index = "0"
-                        title_after = "产品="
-                        next_index = "0"
-                        _fix = True
-                        break
 
-                if not _fix:
-                    sentence_groups = self.find_title_by_pattern(_text[:10])
-                    if sentence_groups:
-                        title_before = standard_title_context(sentence_groups[1][1])
-                        if title_before in dict_before and dict_before[title_before]>1:
-                            sentence_title = sentence_groups[0][0]
-                            sentence_title_text = sentence_groups[0][1]
-                            title_index = sentence_groups[-2][1]
-
-                            title_after = sentence_groups[-1][1]
-                            next_index = self.get_next_title(title_index)
-                        else:
-                            title_before = None
+                sentence_groups = self.find_title_by_pattern(_text[:10])
+                if sentence_groups:
+                    title_before = standard_title_context(sentence_groups[1][1])
+                    title_after = sentence_groups[-1][1]
+                    sentence_title_text = sentence_groups[0][1]
+                    other_text = _text.replace(sentence_title_text,"")
+                    if (title_before in dict_before and dict_before[title_before]>1) or title_after!="":
+                        sentence_title = sentence_groups[0][0]
+
+                        title_index = sentence_groups[-2][1]
+                        next_index = self.get_next_title(title_index)
+
+                        other_text = _text.replace(sentence_title_text,"")
+
+                        for p in products:
+                            if other_text.strip()==p.strip():
+                                has_product = True
 
+                    else:
+                        _fix = False
+
+                        for p in products:
+                            if other_text.strip()==p.strip():
+                                title_before = "=产品"
+                                sentence_title = "title_0"
+                                sentence_title_text = p
+                                title_index = "0"
+                                title_after = "产品="
+                                next_index = "0"
+                                _fix = True
+                                has_product = True
+                                break
+                        if not _fix:
+                            title_before = None
+                            title_after = None
+                            sentence_title_text = None
+                else:
+                    if len(_text)<40 and re.search(_param_pattern,_text) is not None:
+                        for p in products:
+                            if _text.find(p)>=0:
+                                title_before = "=产品"
+                                sentence_title = "title_0"
+                                sentence_title_text = p
+                                title_index = "0"
+                                title_after = "产品="
+                                next_index = "0"
+                                _fix = True
+                                has_product = True
+                                break
 
             if _type=="sentence":
                 if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
@@ -402,6 +448,8 @@ class ParseDocument():
                 _table = _soup.find("table")
                 if _table is not None:
                     list_table = getTable(_table)
+                    if len(list_table)==0:
+                        continue
                     table_columns = len(list_table[0])
 
                     if auto_merge_table:
@@ -428,7 +476,7 @@ class ParseDocument():
                 _data = {"type":_type, "text":_text,"list_table":list_table,"line_width":len(_text),"sentence_title":sentence_title,"title_index":title_index,
                          "sentence_title_text":sentence_title_text,"sentence_groups":sentence_groups,"parent_title":parent_title,
                          "child_title":childs,"title_before":title_before,"title_after":title_after,"title_next":title_next,"next_index":next_index,
-                         "block":block}
+                         "block":block,"has_product":has_product}
 
                 if _type=="table":
                     last_table = _data
@@ -543,22 +591,56 @@ class ParseDocument():
 
                 list_data.append(_data)
 
+        for _data in list_data:
+
+            childs = _data["child_title"]
+
+            for c_i in range(len(childs)):
+                cdata = childs[c_i]
+                if cdata["has_product"]:
+                    continue
+                else:
+                    if c_i>0:
+                        last_cdata = childs[c_i-1]
+                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
+                            cdata["has_product"] = True
+                    if c_i<len(childs)-1:
+                        last_cdata = childs[c_i+1]
+                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
+                            cdata["has_product"] = True
+            #second pass runs in reverse; index childs directly so the neighbour
+            #lookups refer to cdata's own siblings rather than the loop counter
+            for c_i in range(len(childs)-1,-1,-1):
+                cdata = childs[c_i]
+                if cdata["has_product"]:
+                    continue
+                else:
+                    if c_i>0:
+                        prev_cdata = childs[c_i-1]
+                        if cdata["sentence_title"] is not None and prev_cdata["sentence_title"] is not None and prev_cdata["title_before"]==cdata["title_before"] and prev_cdata["title_after"]==cdata["title_after"] and prev_cdata["has_product"]:
+                            cdata["has_product"] = True
+                    if c_i<len(childs)-1:
+                        next_cdata = childs[c_i+1]
+                        if cdata["sentence_title"] is not None and next_cdata["sentence_title"] is not None and next_cdata["title_before"]==cdata["title_before"] and next_cdata["title_after"]==cdata["title_after"] and next_cdata["has_product"]:
+                            cdata["has_product"] = True
+
+
         return list_data
 
+
 def standard_title_context(_title_context):
     return _title_context.replace("(","(").replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")
 
 def standard_product(sentence):
     return sentence.replace("(","(").replace(")",")")
 
-def extract_products(list_data,_product,_param_pattern = "产品名称|采购内存|标的名称|采购内容|(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体|标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)[\))的]?([、\w]{,4}名称|内容|描述)|标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$"):
+def extract_products(list_data,_product,_param_pattern = "产品名称|设备材料|采购内存|标的名称|采购内容|(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体|标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)[\))的]?([、\w]{,4}名称|内容|描述)|标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$"):
     _product = standard_product(_product)
     list_result = []
+    list_table_products = []
     for _data_i in range(len(list_data)):
         _data = list_data[_data_i]
         _type = _data["type"]
         _text = _data["text"]
-        table_products = []
+
         if _type=="table":
             list_table = _data["list_table"]
             if list_table is None:
@@ -584,6 +666,8 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|采购内
             for line_i in range(len(list_table)):
                 line = list_table[line_i]
                 for cell_i in list_head_index:
+                    if cell_i>=len(line):
+                        continue
                     cell = line[cell_i]
                     cell_text = cell[0]
                     head_cell_text += cell_text
@@ -592,27 +676,29 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|采购内
             if re.search("招标人|采购人|项目编号|项目名称|金额|^\d+$",head_cell_text) is not None:
                 list_head_index = []
 
-
             for line in list_table:
                 line_text = ",".join([cell[0] for cell in line])
                 for cell_i in range(len(line)):
                     cell = line[cell_i]
                     cell_text = cell[0]
-                    if cell_text is not None and _product is not None and len(cell_text)<len(_product)*10 and re.search(_product,cell_text) is not None and re.search("单价|数量|总价|规格|品牌|型号|用途|要求|采购量",line_text) is not None:
+                    if cell_text is not None and _product is not None and len(cell_text)<len(_product)*10 and cell_text.find(_product)>=0 and re.search("单价|数量|总价|规格|品牌|型号|用途|要求|采购量",line_text) is not None:
                         list_head_index.append(cell_i)
 
             list_head_index = list(set(list_head_index))
             if len(list_head_index)>0:
-                for line_i in range(_begin_index,len(list_table)):
-                    line = list_table[line_i]
-                    has_number = False
-                    for cell_i in range(len(line)):
-                        cell = line[cell_i]
-                        cell_text = cell[0]
-                        if re.search("^\d+$",cell_text) is not None:
-                            has_number = True
+                has_number = False
+                for cell_i in list_head_index:
+                    table_products = []
+
+                    for line_i in range(_begin_index,len(list_table)):
+                        line = list_table[line_i]
+
+                        for _i in range(len(line)):
+                            cell = line[_i]
+                            cell_text = cell[0]
+                            if re.search("^\d+$",cell_text) is not None:
+                                has_number = True
 
-                    for cell_i in list_head_index:
                         if cell_i>=len(line):
                             continue
                         cell = line[cell_i]
@@ -621,10 +707,23 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|采购内
                             if re.search("^[\da-zA-Z]+$",cell_text) is None:
                                 table_products.append(cell_text)
 
-        if len(table_products)>0:
-            if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=20:
-                list_result.extend(table_products)
-    list_result = list(set([a for a in list_result if len(a)>1 and len(a)<20 and re.search("预算|合计|金额|万元|运费",a) is None]))
+                    if len(table_products)>0:
+                        logger.debug("table products %s"%(str(table_products)))
+                        if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=30:
+                            if re.search("招标人|代理人|预算|数量|交货期|品牌|产地","".join(table_products)) is None:
+                                list_table_products.append(table_products)
+    _find = False
+    for table_products in list_table_products:
+        for _p in table_products:
+            if is_similar(_product,_p,90):
+                _find = True
+                logger.debug("similar table_products %s"%(str(table_products)))
+                list_result = list(set([a for a in table_products if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费|^其他$",a) is None]))
+                break
+    if not _find:
+        for table_products in list_table_products:
+            list_result.extend(table_products)
+        list_result = list(set([a for a in list_result if len(a)>1 and len(a)<30 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
     return list_result
 
 
@@ -671,12 +770,20 @@ def get_correct_product(product,products):
 def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
     _text = ""
 
+    end_next = False
     for _child in childs:
 
         child_text = _child.get("text")
 
+
         if child_text.find(_product)>=0:
-            is_begin = True
+            if not is_begin:
+                is_begin = True
+                if not end_next:
+                    if _child["sentence_title"] is not None and isinstance(_child["title_next"],dict) and _child["title_next"]["sentence_title"] is not None:
+                        end_next = True
+                        end_title = _child["title_next"]
+                        logger.debug("end_title %s "%end_title["text"])
 
         logger.debug("%s-%s-%s"%("get_childs_text",child_text[:10],str(is_begin)))
 
@@ -684,13 +791,15 @@ def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
             if child_text.find(p)>=0 and is_similar(_product,p,90):
                 is_begin = True
 
-            if child_text.find(_product)<0 and  child_text.find(p)>=0 and not is_similar(_product,p,80):
+            if child_text.find(_product)<0  and not is_similar(_product,p,80) and  (child_text.find(p)>=0 or _child["has_product"]):
                 if is_begin:
                     is_end = True
+                    logger.debug("%s-%s-%s"%("get_childs_text end",child_text[:10],p))
                 break
         if re.search(end_pattern,child_text) is not None:
             if is_begin:
                 is_end = True
+                logger.debug("%s-%s-%s"%("get_childs_text end",child_text[:10],str(is_end)))
 
         if is_begin and is_end:
             break
@@ -699,45 +808,58 @@ def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
             _text += _child.get("text")+"\r\n"
         childs2 = _child.get("child_title",[])
 
+
         if len(childs2)>0:
             for _child2 in childs2:
                 child_text,is_begin,is_end = get_childs_text([_child2],_product,products,is_begin)
-                if is_begin and is_end:
-                    break
-                else:
-                    if is_begin:
-                        _text += child_text
+                if is_begin:
+                    _text += child_text
+                    if is_end:
+                        break
+
+        if end_next:
+            is_end = True
+
+    #     logger.debug("%s-%s-%s"%("get_childs_text1",_text,str(is_begin)))
+    # logger.debug("%s-%s-%s"%("get_childs_text2",_text,str(is_begin)))
     return _text,is_begin,is_end
 
 def extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result,):
     _data = list_data[_data_i]
     childs = _data.get("child_title",[])
     if len(childs)>0:
-        child_text,_,_ = get_childs_text([parent_title],_product,products)
-        logger.info("extract_parameters_by_tree child_text:%s"%child_text)
+        child_text,_,_ = get_childs_text([_data],_product,products)
         if len(child_text)>0:
+            logger.info("extract_type by_tree child_text:%s"%child_text)
             list_result.append(child_text)
-            return True
     if parent_title is not None:
+        child_text,_,_ = get_childs_text([parent_title],_product,products)
+        if len(child_text)>0:
+            logger.info("extract_type by_tree child_text:%s"%child_text)
+            list_result.append(child_text)
+
         childs = parent_title.get("child_title",[])
         if len(childs)>0:
 
             range_data = get_range_data_by_childs(list_data[_data_i:],childs)
             p_text = ""
             _find = False
+            end_id = id(_data["title_next"]) if isinstance(_data["title_next"],dict) and _data["title_next"]["sentence_title"] is not None else None
             for pdata in range_data:
-                ptype = _data["type"]
                 ptext = pdata["text"]
                 for p in products:
-                    if ptext.find(_product)<0 and  ptext.find(p)>=0:
+                    if ptext.find(_product)<0 and  (ptext.find(p)>=0 or pdata["has_product"]):
                         _find = True
                         break
                 if re.search(end_pattern,ptext) is not None:
                     _find = True
                 if _find:
                     break
+                if id(pdata)==end_id:
+                    break
                 p_text += ptext+"\r\n"
             if len(p_text)>0:
+                logger.debug("extract_type by parent range_text:%s"%p_text)
                 list_result.append(p_text)
                 return True
     return False
@@ -766,6 +888,7 @@ def get_table_pieces(_text,_product,products,list_result,_find):
                 list_trs.append(tr)
         if len(list_trs)>0:
             table_html = "<table>%s</table>"%("\r\n".join([str(a) for a in list_trs]))
+            logger.debug("extract_type table slices %s"%(table_html))
             list_result.append(table_html)
 
 def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result):
@@ -778,8 +901,9 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
         max_length = max([len(a) for a in list_table])
         min_length = min([len(a) for a in list_table])
         text_line_first = ",".join(a[0] for a in list_table[0])
-        if min_length<max_length/2:
-            return
+        if max_length>10:
+            if min_length<max_length/2:
+                return
         last_data = list_data[_data_i-1]
         _flag = False
         if last_data["type"]=="sentence" and last_data["text"].find(_product)>=0:
@@ -789,8 +913,8 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
         if re.search(_param_pattern,text_line_first) is not None and text_line_first.find(_product)>=0:
             _flag = True
         if _flag:
-            logger.debug("extract_type add all table %s"%_text)
             if len(products)==0:
+                logger.debug("extract_type whole table by param and product %s"%(_text))
                 list_result.append(_text)
             else:
                 for p in products:
@@ -813,20 +937,99 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
             for line in list_table:
                 for cell in line:
                     cell_text = cell[0]
-                    if len(cell_text)>50 and len(re.findall("\d+",cell_text))>10 and cell_text.find(_product)>=0:
-                        list_result.append(cell_text)
+                    if len(cell_text)>50 and len(re.findall(meter_pattern,cell_text))>5 and cell_text.find(_product)>=0:
+                        _f = True
+                        #use distinct names so the long parameter cell found above
+                        #is not shadowed by the cells scanned here
+                        for other_cell in line:
+                            if not _f:
+                                break
+                            other_text = other_cell[0]
+                            for p in products:
+                                if other_text.find(p)>=0 and p!=_product:
+                                    _f = False
+                                    break
+                        if _f:
+                            logger.debug("extract_type param column %s"%(cell_text))
+                            list_result.append(cell_text)
                     if len(cell_text)<len(_product)*10 and str(cell_text).find(_product)>=0:
                         for _index in list_head_index:
                             if _index>=len(line):
                                 continue
                             _cell = line[_index]
                             if len(cell[0])>0:
-                                logger.info("%s-%s"%("add on table",_cell[0]))
+                                logger.info("%s-%s"%("extract_type add on table text:",_cell[0]))
                                 list_result.append(_cell[0])
         if not _flag and (re.search(_param_pattern,_text) is not None or (parent_title is not None and re.search(_param_pattern,parent_title["text"]) is not None)) and _text.find(_product)>=0:
             get_table_pieces(_text,_product,products,list_result,False)
 
 
+def extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project):
+    _text = _data["text"]
+    if _text.find(_product)>=0:
+        parent_title = _data.get("parent_title")
+        parent_text = ""
+        parent_parent_title = None
+        parent_parent_text = ""
+        parent_title_index = None
+        parent_parent_title_index = None
+        childs = get_childs([_data])
+
+        child_find = False
+        for c in childs:
+            if re.search(_param_pattern,c["text"]) is not None and len(c["text"])<30:
+                logger.debug("child text %s"%(c["text"]))
+                child_find = True
+                break
+
+        extract_text,_,_ = get_childs_text([_data],_product,products)
+        logger.debug("childs found extract_text %s %s"%(str(child_find),extract_text))
+        if child_find:
+            if len(extract_text)>0:
+                list_result.append(extract_text)
+        else:
+            limit_nums = len(_product)*2+5
+            if len(_product)<=3:
+                limit_nums += 6
+            if _text.find("数量")>=0:
+                limit_nums += 6
+            if len(_text)<=limit_nums and _data["sentence_title"] is not None:
+                if re.search(meter_pattern,extract_text) is not None:
+                    list_result.append(extract_text)
+            elif len(re.findall(meter_pattern,extract_text))>2:
+                list_result.append(extract_text)
+
+        if parent_title is not None:
+            parent_text = parent_title.get("text","")
+            parent_parent_title = parent_title.get("parent_title")
+            parent_title_index = parent_title["title_index"]
+            if parent_parent_title is not None:
+                parent_parent_text = parent_parent_title.get("text","")
+                parent_parent_title_index = parent_parent_title["title_index"]
+
+        _suit = False
+        if re.search(_param_pattern,_text) is not None and len(_text)<50:
+            _suit = True
+        if re.search(_param_pattern,parent_text) is not None and len(parent_text)<50:
+            _suit = True
+        if re.search(_param_pattern,parent_parent_text) is not None and len(parent_parent_text)<50:
+            _suit = True
+        if _suit:
+            logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
+            if not extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result):
+                logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
+                extract_parameters_by_tree(_product,products,list_data,_data_i,parent_parent_title,list_result)
+
+    if re.search(_param_pattern,_text) is not None and len(_text)<50:
+        childs = _data["child_title"]
+        if len(childs)>0:
+            extract_text,_,_ = get_childs_text([_data],_product,products)
+            if len(extract_text)>0:
+                logger.debug("extract_type param-product %s"%(extract_text))
+                list_result.append(extract_text)
+            elif is_project:
+                extract_text,_,_ = get_childs_text([_data],_product,products,is_begin=True)
+                if len(extract_text)>0 and re.search(meter_pattern,extract_text) is not None:
+                    logger.debug("extract_type sentence is_project param-product is product %s"%(extract_text))
+                    list_result.append(extract_text)
 
 def getBestProductText(list_result,_product,products):
     list_result.sort(key=lambda x:len(re.findall(meter_pattern+"|"+'[::;;]|\d+[%A-Za-z]+',BeautifulSoup(x,"html5lib").get_text())), reverse=True)
@@ -840,7 +1043,7 @@ def getBestProductText(list_result,_product,products):
         _result = list_result[i]
         _check = True
         _result_text = BeautifulSoup(_result,"html5lib").get_text()
-        _search = re.search("项目编号[::]|项目名称[::]|联合体投标",_result)
+        _search = re.search("项目编号[::]|项目名称[::]|联合体投标|开户银行",_result)
         if _search is not None:
             logger.debug("result%d error illegal text %s"%(i,str(_search)))
             _check = False
@@ -849,13 +1052,31 @@ def getBestProductText(list_result,_product,products):
                 if _result_text.find(p)>0 and not (is_similar(_product,p,80) or p.find(_product)>=0 or _product.find(p)>=0):
                     logger.debug("result%d error product scoss %s"%(i,p))
                     _check = False
-        if len(_result_text)<50:
+        if len(_result_text)<100:
             if re.search(meter_pattern,_result_text) is None:
                 logger.debug("result%d error text min count"%(i))
                 _check = False
         if len(_result_text)>5000:
-            logger.debug("result%d error text max count"%(i))
+            if len(_result_text)>10000:
+                logger.debug("result%d error text max count"%(i))
+                _check = False
+            elif len(re.findall(meter_pattern,_result_text))<10:
+                logger.debug("result%d error text max count less meter"%(i))
+                _check = False
+
+        list_find = list(set(re.findall(meter_pattern,_result_text)))
+
+        not_list_find = list(set(re.findall(not_meter_pattern,_result_text)))
+        _count = len(list_find)-len(not_list_find)
+        has_num = False
+        for _find in list_find:
+            if re.search('[0-9a-zA-Z]',_find) is not None:
+                has_num = True
+                break
+        if not(_count>=2 and has_num or _count>=5):
+            logger.debug("result%d error match not enough"%(i))
             _check = False
+
         if _check:
             return _result
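The acceptance test in getBestProductText is now score-based rather than a bare length check: the count of distinct meter_pattern hits minus not_meter_pattern hits must reach 2 with at least one alphanumeric hit, or 5 outright. A restatement of that rule as a standalone predicate (patterns as defined at the top of this file):

    import re

    def passes_meter_check(text, meter_pattern, not_meter_pattern):
        hits = set(re.findall(meter_pattern, text))
        noise = set(re.findall(not_meter_pattern, text))
        count = len(hits) - len(noise)
        has_num = any(re.search("[0-9a-zA-Z]", h) for h in hits)
        #accept only when parameter-like tokens clearly outnumber commercial ones
        return (count >= 2 and has_num) or count >= 5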
 
@@ -868,6 +1089,11 @@ def extract_product_parameters(list_data,_product):
     _product = get_correct_product(_product,products)
     logger.debug("all products %s-%s"%(_product,str(products)))
     is_project = False
+    if re.search("项目名称|采购项目",_product) is not None:
+        is_project = True
+        
+    if len(products)==1 and is_similar(products[0],_product,90):
+        is_project = True
     _find_count = 0
     for _data_i in range(len(list_data)):
         _data = list_data[_data_i]
@@ -876,84 +1102,23 @@ def extract_product_parameters(list_data,_product):
         if _type=="sentence":
             if _text.find(_product)>=0:
                 _find_count += 1
-                if re.search("项目名称|采购项目",_text) is not None:
-                   is_project = True
-                if re.search("项目名称|采购项目",_product) is not None:
+                if re.search("项目名称|采购项目",_text) is not None and re.search("等",_text) is not None:
                     is_project = True
-                parent_title = _data.get("parent_title")
-                parent_text = ""
-                parent_parent_title = None
-                parent_parent_text = ""
-                parent_title_index = None
-                parent_parent_title_index = None
-                childs = get_childs([_data])
-
-
-                child_find = False
-                for c in childs:
-                    if re.search(_param_pattern,c["text"]) is not None and len(c["text"])<30:
-                        child_find = True
-                        break
-
-                extract_text,_,_ = get_childs_text([_data],_product,products)
-                logger.debug("childs found extract_text %s"%extract_text)
-                if child_find:
-                    if len(extract_text)>0:
-                        list_result.append(extract_text)
-                else:
-                    if len(_text)<len(_product)+10 and _data["sentence_title"] is not None:
-                        if re.search(meter_pattern,extract_text) is not None:
-                            list_result.append(extract_text)
-
-                if parent_title is not None:
-                    parent_text = parent_title.get("text","")
-                    parent_parent_title = parent_title.get("parent_title")
-                    parent_title_index = parent_title["title_index"]
-                    if parent_parent_title is not None:
-                        parent_parent_text = parent_parent_title.get("text","")
-                        parent_parent_title_index = parent_parent_title["title_index"]
-
-                _suit = False
-                if re.search(_param_pattern,_text) is not None and len(_text)<50:
-                    _suit = True
-                if re.search(_param_pattern,parent_text) is not None and len(parent_text)<50:
-                    _suit = True
-                if re.search(_param_pattern,parent_parent_text) is not None and len(parent_parent_text)<50:
-                    _suit = True
-                if _suit:
-                    logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
-                    if not extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result):
-                        logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
-                        extract_parameters_by_tree(_product,products,list_data,_data_i,parent_parent_title,list_result)
-
-
-            if re.search(_param_pattern,_text) is not None and len(_text)<50:
-                childs = _data["child_title"]
-                if len(childs)>0:
-                    logger.debug("extract_type sentence %s"%("re.search(_param_pattern,_text) is not None and len(_text)<50:"))
-                    extract_text,_,_ = get_childs_text([_data],_product,products)
-                    if len(extract_text)>0:
-                        list_result.append(extract_text)
-                    elif is_project:
-                        logger.debug("extract_type sentence is_project")
-                        extract_text,_,_ = get_childs_text([_data],_product,products,is_begin=True)
-                        if len(extract_text)>0 and re.search(meter_pattern,extract_text) is not None:
-                            list_result.append(extract_text)
-
+            extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project)
 
         elif _type=="table":
             if _text.find(_product)>=0:
                 _find_count += 1
             extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result)
 
-
-    return getBestProductText(list_result,_product,products),_find_count
+    _text = getBestProductText(list_result,_product,products)
+    return _text,_find_count
 
 
 if __name__ == '__main__':
 
-    filepath = "download/8679fef3a6fff56abcbdaccb1a190c80.html"
-    _product = "移液器"
+    filepath = "download/4597dcc128bfabc7584d10590ae50656.html"
+    _product = "彩色多普勒超声诊断仪"
 
     _html = open(filepath, "r", encoding="utf8").read()
 
@@ -965,5 +1130,6 @@ if __name__ == '__main__':
 
     _text,_count = extract_product_parameters(list_data,_product)
     logger.info("find count:%d"%(_count))
-    logger.info("extract_text %s"%_text)
+    logger.info("extract_parameter_text::%s"%(_text))
+
 

+ 1 - 2
BaseDataMaintenance/maintenance/product/productUtils.py

@@ -203,7 +203,6 @@ def jaccard_score(source,target):
     return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
 
 
-from fuzzywuzzy import fuzz
 def is_similar(source,target,_radio=None):
     source = str(source).lower()
     target = str(target).lower()
@@ -227,7 +226,7 @@ def is_similar(source,target,_radio=None):
     if min_len<2:
         return False
      #check the similarity ratio
-    similar = fuzz.ratio(source,target)
+    similar = Levenshtein.ratio(source,target)*100
     if similar>=min_ratio:
         log("%s and %s similar_jaro %d"%(source,target,similar))
         return True
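fuzzywuzzy's fuzz.ratio is dropped in favour of python-Levenshtein's ratio, scaled by 100 so the existing min_ratio thresholds keep their meaning (the Levenshtein module is assumed to be imported elsewhere in productUtils.py). A quick sketch of the equivalence:

    import Levenshtein

    a, b = "彩色多普勒超声诊断仪", "彩色多普勒超声仪"
    similar = Levenshtein.ratio(a, b) * 100   #0-100 scale, comparable to fuzz.ratio
    print(round(similar))                     #roughly 89 for this pair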

+ 0 - 247
BaseDataMaintenance/maintenance/product/product_attachment.py

@@ -1,247 +0,0 @@
-
-
-
-from apscheduler.schedulers.blocking import BlockingScheduler
-from tablestore import *
-from BaseDataMaintenance.dataSource.source import getConnect_ots,getAuth,is_internal
-from BaseDataMaintenance.dataSource.interface import *
-from multiprocessing import Queue as PQueue,Process
-from BaseDataMaintenance.model.ots.document_product import *
-from BaseDataMaintenance.model.ots.attachment import *
-from BaseDataMaintenance.common.Utils import *
-from BaseDataMaintenance.common.ossUtils import *
-from BaseDataMaintenance.maintenance.product.htmlparser import *
-import oss2
-from BaseDataMaintenance.common.multiThread import MultiThreadHandler
-
-parameter_status_no_bidfile = -1
-parameter_status_to_process = 0
-parameter_status_process_succeed = 1
-parameter_status_process_failed = 2
-parameter_status_process_jump = 3
-parameter_status_not_found = 4
-
-class Product_Attachment_Processor():
-
-    def __init__(self,):
-        self.ots_client = getConnect_ots()
-        self.product_attachment_queue = PQueue()
-        self.product_attachment_queue_size = 100
-        self.set_product_attachment = set()
-        self.attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
-        self.auth = getAuth()
-        oss2.defaults.connection_pool_size = 100
-        oss2.defaults.multiget_num_threads = 20
-        if is_internal:
-            self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
-        else:
-            self.bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
-        log("bucket_url:%s"%(self.bucket_url))
-        self.attachment_bucket_name = "attachment-hub"
-        self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
-        self.current_path = os.path.dirname(__file__)
-        self.download_path = "%s/%s"%(self.current_path,"download")
-
-    def process_parameters_producer(self,):
-
-        if self.product_attachment_queue.qsize()>self.product_attachment_queue_size/3:
-            return
-        bool_query = BoolQuery(must_queries=[
-            TermQuery("parameter_status",parameter_status_to_process)
-        ])
-        list_id = []
-        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
-                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
-                                                                            ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
-
-        list_data = getRow_ots(rows)
-        for data in list_data:
-            _id = data.get(DOCUMENT_PRODUCT_ID)
-            if _id in self.set_product_attachment:
-                continue
-            self.product_attachment_queue.put(data)
-            list_id.append(_id)
-        while next_token:
-            if self.product_attachment_queue.qsize()>=self.product_attachment_queue_size:
-                break
-            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
-                                                                                SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                                ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
-            list_data = getRow_ots(rows)
-            for data in list_data:
-                _id = data.get(DOCUMENT_PRODUCT_ID)
-                if _id in self.set_product_attachment:
-                    continue
-                self.product_attachment_queue.put(data)
-                list_id.append(_id)
-        self.set_product_attachment =  set(list_id)
-
-    def process_parameters_handler(self,item,result_queue):
-        bid_filemd5s = item.get(DOCUMENT_PRODUCT_BID_FILEMD5S)
-        product_name = item.get(DOCUMENT_PRODUCT_NAME)
-        product_original_name = item.get(DOCUMENT_PRODUCT_ORIGINAL_NAME)
-        list_product = []
-        if product_name is not None:
-            list_product.append(product_name)
-        if product_original_name is not None:
-            list_product.extend(product_original_name.split("_"))
-        list_product = list(set(list_product))
-        dp = Document_product(item)
-        if bid_filemd5s is None or bid_filemd5s=="" or len(list_product)==0:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
-            dp.update_row(self.ots_client)
-            return
-        list_filemd5 = bid_filemd5s.split(",")
-        _find = False
-        _success = False
-        for _filemd5 in list_filemd5:
-            if _find:
-                break
-            atta = attachment({attachment_filemd5:_filemd5})
-            if atta.fix_columns(self.ots_client,[attachment_path,attachment_filetype],True):
-                objectPath = atta.getProperties().get(attachment_path)
-                _filetype = atta.getProperties().get(attachment_filetype)
-                if _filetype in ("doc","xls"):
-                    if len(list_filemd5)==1:
-                        dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_jump,True)
-                        dp.update_row(self.ots_client)
-                        return
-                    else:
-                        continue
-                localpath = "%s/%s.%s"%(self.download_path,_filemd5,_filetype)
-                localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
-                download_succeed = False
-                try:
-                    if not os.path.exists(localpath):
-                        download_succeed = downloadFile(self.bucket,objectPath,localpath)
-                    else:
-                        download_succeed = True
-                except Exception as e:
-                    download_succeed = False
-                if download_succeed:
-                    try:
-                        _html = ""
-                        if os.path.exists(localhtml):
-                            _html = open(localhtml,"r",encoding="utf8").read()
-                            _success = True
-                        if len(_html)>10:
-                            _success = True
-                        else:
-                            _data_base64 = base64.b64encode(open(localpath,"rb").read())
-                            _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,url="http://192.168.2.102:15011/convert",kwargs={'page_no': '1,-1',"max_bytes":"-1"},timeout=6000)
-                            if _success:
-                                localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
-                                with open(localhtml,"w",encoding="utf8") as f:
-                                    f.write(_html)
-                        if _success:
-                            if len(_html)>5:
-                                pd = ParseDocument(_html,True)
-
-                                list_text = []
-                                for _product in list_product:
-                                    pd.fix_tree(_product)
-                                    list_data = pd.tree
-                                    _text,_count = extract_product_parameters(list_data,_product)
-                                    if _count>0:
-                                        _find = True
-                                    if _text is not None:
-                                        list_text.append(_text)
-                                pd = ParseDocument(_html,False)
-
-                                list_text = []
-                                for _product in list_product:
-                                    pd.fix_tree(_product)
-                                    list_data = pd.tree
-                                    _text,_count = extract_product_parameters(list_data,_product)
-                                    if _count>0:
-                                        _find = True
-                                    if _text is not None:
-                                        list_text.append(_text)
-                                if len(list_text)>0:
-                                    list_text.sort(key=lambda x:len(re.findall('[::;;]',BeautifulSoup(x,"html5lib").get_text())), reverse=True)
-                                    _text = list_text[0]
-                                    _success = True
-                                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER,_text,True)
-                                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_succeed,True)
-                                    dp.update_row(self.ots_client)
-                                    return
-                            else:
-                                log("product attachment process filemd5 %s has no content"%(_filemd5))
-                    except Exception as e:
-                        traceback.print_exc()
-                    finally:
-                        try:
-                            # if os.path.exists(localpath):
-                            #     os.remove(localpath)
-                            pass
-                        except Exception as e:
-                            pass
-
-        if not _find:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_not_found,True)
-            dp.update_row(self.ots_client)
-        else:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_failed,True)
-            dp.update_row(self.ots_client)
-
-    def start_process(self):
-        mt = MultiThreadHandler(self.product_attachment_queue,self.process_parameters_handler,None,3,need_stop=False,restart=True)
-        mt.run()
-
-    def process_parameters_comsumer(self,):
-
-        # process_count = 2
-        # list_process = []
-        # for i in range(process_count):
-        #     p = Process(target=self.start_process)
-        #     list_process.append(p)
-        # for p in list_process:
-        #     p.start()
-        # for p in list_process:
-        #     p.join()
-
-        self.start_process()
-
-    def start_process_parameters(self):
-        scheduler = BlockingScheduler()
-        scheduler.add_job(self.process_parameters_producer,"cron",second="*/10")
-        scheduler.add_job(self.process_parameters_comsumer,"cron",second="*/30")
-        scheduler.start()
-
-def start_process_parameters():
-    pap = Product_Attachment_Processor()
-    pap.start_process_parameters()
-
-def change_parameters_status():
-    ots_client =getConnect_ots()
-    bool_query = BoolQuery(must_queries=[
-        RangeQuery("parameter_status",-1)
-    ],
-                           must_not_queries=[
-        TermQuery("parameter_status",parameter_status_to_process),
-        TermQuery("parameter_status",parameter_status_process_succeed),
-        TermQuery("parameter_status",parameter_status_process_jump),
-        # TermQuery("parameter_status",parameter_status_no_bidfile),
-
-    ])
-    list_data = []
-    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
-                                                                        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
-                                                                        ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
-
-    list_data.extend(getRow_ots(rows))
-    print("total_count",total_count)
-    while next_token:
-        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
-                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                            ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S],return_type=ColumnReturnType.SPECIFIED))
-        list_data.extend(getRow_ots(rows))
-    for data in list_data:
-        dp = Document_product(data)
-        dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_to_process,True)
-        dp.setValue(DOCUMENT_PRODUCT_PARAMETER,"",True)
-        dp.update_row(ots_client)
-
-if __name__ == '__main__':
-    start_process_parameters()
-    # change_parameters_status()

+ 322 - 0
BaseDataMaintenance/maintenance/product/product_parameter.py

@@ -0,0 +1,322 @@
+
+
+
+from apscheduler.schedulers.blocking import BlockingScheduler
+from tablestore import *
+from BaseDataMaintenance.dataSource.source import getConnect_ots,getAuth,is_internal
+from BaseDataMaintenance.dataSource.interface import *
+from multiprocessing import Queue as PQueue
+from multiprocessing import Process
+from BaseDataMaintenance.model.ots.document_product import *
+from BaseDataMaintenance.model.ots.attachment import *
+from BaseDataMaintenance.common.Utils import *
+from BaseDataMaintenance.common.ossUtils import *
+from BaseDataMaintenance.maintenance.product.htmlparser import *
+from BaseDataMaintenance.maintenance.product.productUtils import pool_product
+import oss2
+from BaseDataMaintenance.common.multiThread import MultiThreadHandler
+
+parameter_status_no_bidfile = -1
+parameter_status_to_process = 0
+parameter_status_process_succeed = 1
+parameter_status_process_failed = 2
+parameter_status_process_jump = 3
+parameter_status_not_found = 4
+
+import redis
+
+from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
+
+class Product_Attachment_Processor():
+
+    def __init__(self,):
+        self.ots_client = getConnect_ots()
+        self.product_attachment_queue = PQueue()
+        self.product_attachment_queue_size = 50
+        self.set_product_attachment = set()
+        self.attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
+        self.auth = getAuth()
+        oss2.defaults.connection_pool_size = 100
+        oss2.defaults.multiget_num_threads = 20
+        if is_internal:
+            self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
+        else:
+            self.bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
+        log("bucket_url:%s"%(self.bucket_url))
+        self.attachment_bucket_name = "attachment-hub"
+        self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
+        self.current_path = os.path.dirname(__file__)
+        self.download_path = "%s/%s"%(self.current_path,"download")
+        self.test_url="http://192.168.2.102:15011/convert"
+
+    def process_parameters_producer(self,):
+        attachment_size = getQueueSize("dataflow_attachment")
+        if attachment_size<100:
+
+            _qsize = self.product_attachment_queue.qsize()
+            log("product_attachment_queue %d"%(_qsize))
+            if _qsize>self.product_attachment_queue_size/3:
+                return
+            bool_query = BoolQuery(must_queries=[
+                TermQuery("parameter_status",parameter_status_to_process)
+            ])
+            list_id = []
+            dict_docid_list = {}
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
+                                                                                ColumnsToGet([DOCUMENT_PRODUCT_ATTACHMENTS,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME,DOCUMENT_PRODUCT_DOCID],return_type=ColumnReturnType.SPECIFIED))
+
+            list_data = getRow_ots(rows)
+            _count = 0
+            for data in list_data:
+                _id = data.get(DOCUMENT_PRODUCT_ID)
+                list_id.append(_id)
+                if _id in self.set_product_attachment:
+                    continue
+                docid = data.get(DOCUMENT_PRODUCT_DOCID)
+                if docid not in dict_docid_list:
+                    dict_docid_list[docid] = []
+                dict_docid_list[docid].append(data)
+
+                _count += 1
+            while next_token:
+                if len(dict_docid_list.keys())>=self.product_attachment_queue_size:
+                    break
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                    ColumnsToGet([DOCUMENT_PRODUCT_ATTACHMENTS,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME,DOCUMENT_PRODUCT_DOCID],return_type=ColumnReturnType.SPECIFIED))
+                list_data = getRow_ots(rows)
+                for data in list_data:
+                    _id = data.get(DOCUMENT_PRODUCT_ID)
+                    list_id.append(_id)
+                    if _id in self.set_product_attachment:
+                        continue
+                    docid = data.get(DOCUMENT_PRODUCT_DOCID)
+                    if docid not in dict_docid_list:
+                        dict_docid_list[docid] = []
+                    dict_docid_list[docid].append(data)
+
+                    _count += 1
+            for k,v in dict_docid_list.items():
+                self.product_attachment_queue.put(v)
+            _qsize = self.product_attachment_queue.qsize()
+            log("after product_attachment_queue %d"%(_qsize))
+            self.set_product_attachment = set(list_id)
+
+    def get_whole_html(self,_filemd5):
+        atta = attachment({attachment_filemd5:_filemd5})
+        _html = ""
+
+        db = redis.Redis(connection_pool=pool_product)
+        _key = "filemd5:%s"%(_filemd5)
+
+        _cache_html = None
+        try:
+            _cache_html = db.get(_key)
+        except Exception as e:
+            logger.info("failed to read cached html from redis")
+
+        if _cache_html is not None:
+            # redis may return bytes unless the pool sets decode_responses; decode defensively
+            _html = _cache_html.decode("utf8") if isinstance(_cache_html,bytes) else _cache_html
+        else:
+            if atta.fix_columns(self.ots_client,[attachment_path,attachment_filetype,attachment_size],True):
+                objectPath = atta.getProperties().get(attachment_path)
+                _filetype = atta.getProperties().get(attachment_filetype)
+                _size = atta.getProperties().get(attachment_size,0)
+                if _size<=0 or _size>=20*1024*1024:
+                    return _html
+
+                # not supported on windows
+                # if _filetype in ("doc","xls"):
+                #     if len(list_filemd5)==1:
+                #         dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_jump,True)
+                #         dp.update_row(self.ots_client)
+                #         return
+                #     else:
+                #         continue
+
+                localpath = "%s/%s.%s"%(self.download_path,_filemd5,_filetype)
+                localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
+                download_succeed = False
+                try:
+                    if not os.path.exists(localpath):
+                        download_succeed = downloadFile(self.bucket,objectPath,localpath)
+                    else:
+                        download_succeed = True
+                except Exception as e:
+                    download_succeed = False
+                if download_succeed:
+                    try:
+                        start_time = time.time()
+                        if os.path.exists(localhtml):
+                            _html = open(localhtml,"r",encoding="utf8").read()
+                            _success = True
+                        if len(_html)>10:
+                            _success = True
+                        else:
+                            _data_base64 = base64.b64encode(open(localpath,"rb").read())
+
+                            _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,kwargs={'page_no': '1,-1',"max_bytes":"-1","timeout":6000},timeout=6000)
+
+                            if _success:
+                                db.set(_key,_html,24*60*60)
+                                # save for debug
+                                # localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
+                                # with open(localhtml,"w",encoding="utf8") as f:
+                                #     f.write(_html)
+
+                    except ConnectionError as e1:
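+                        # assumption: the elapsed threshold is in seconds; a conversion that
+                        # ran long before dropping the connection is cached as empty so the
+                        # same filemd5 is not re-converted for the next 24h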
+                        if time.time()-start_time>5000:
+                            db.set(_key,_html,24*60*60)
+                        else:
+                            raise e1
+                    except Exception as e:
+                        traceback.print_exc()
+                    finally:
+                        try:
+                            if os.path.exists(localpath):
+                                os.remove(localpath)
+                            pass
+                        except Exception as e:
+                            pass
+            else:
+                log("attachment %s not exists"%_filemd5)
+        return _html
+
+    def process_parameters_handler(self,list_item,result_queue):
+        for item in list_item:
+            attachments = item.get(DOCUMENT_PRODUCT_ATTACHMENTS)
+            product_name = item.get(DOCUMENT_PRODUCT_NAME)
+            product_original_name = item.get(DOCUMENT_PRODUCT_ORIGINAL_NAME)
+            list_product = []
+            log("processing name:%s original_name:%s attachments:%s"%(product_name,product_original_name,attachments))
+            if product_original_name is not None:
+                _l = product_original_name.split("_")
+                _l.reverse()
+                list_product.extend(_l)
+            if product_name is not None:
+                list_product.append(product_name)
+            list_product = list(set(list_product))
+            dp = Document_product(item)
+            if attachments is None or attachments=="" or len(list_product)==0:
+                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
+                dp.update_row(self.ots_client)
+                # continue rather than return: other products in this docid group may still have attachments
+                continue
+            list_attachment = json.loads(attachments)
+            list_attachment.sort(key=lambda x:0 if x.get("classification")=="招标文件" else 1 if x.get("classification")=="采购清单" else 2)
+            list_filemd5 = [a.get("filemd5","") for a in list_attachment]
+            _find = False
+            _success = False
+            list_text = []
+            for _filemd5 in list_filemd5:
+                _html = self.get_whole_html(_filemd5)
+                if len(_html)>5:
+
+                    pd = ParseDocument(_html,True)
+                    for _product in list_product:
+                        pd.fix_tree(_product)
+                        list_data = pd.tree
+                        _text,_count = extract_product_parameters(list_data,_product)
+                        if _count>0:
+                            _find = True
+                        if _text is not None:
+                            list_text.append(_text)
+
+                    pd = ParseDocument(_html,False)
+                    for _product in list_product:
+                        pd.fix_tree(_product)
+                        list_data = pd.tree
+                        _text,_count = extract_product_parameters(list_data,_product)
+                        if _count>0:
+                            _find = True
+                        if _text is not None:
+                            list_text.append(_text)
+                else:
+                    log("product attachment process filemd5 %s has no content"%(_filemd5))
+                if len(list_text)>0:
+                    _text = getBestProductText(list_text,'',[])
+                    logger.info("extract_parameter_text bid_filemd5s:%s name:%s original_name:%s parameter_text:%s"%(str(list_filemd5),product_name,product_original_name,_text))
+                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER,_text,True)
+                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_succeed,True)
+                    dp.update_row(self.ots_client)
+                    _success = True
+                    break
+
+            if not _success:
+                if not _find:
+                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_not_found,True)
+                    dp.update_row(self.ots_client)
+                else:
+                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_failed,True)
+                    dp.update_row(self.ots_client)
+
+    def start_process(self):
+        self.process_parameters_producer()
+        thread_count = 7
+        mt = MultiThreadHandler(self.product_attachment_queue,self.process_parameters_handler,None,thread_count,need_stop=False,restart=True)
+        mt.run()
+
+    def process_parameters_consumer(self,):
+        # process_count = 3
+        # list_process = []
+        # for i in range(process_count):
+        #     p = Process(target=self.start_process)
+        #     list_process.append(p)
+        # for p in list_process:
+        #     p.start()
+        # for p in list_process:
+        #     p.join()
+        self.start_process()
+
+
+    def start_process_parameters(self):
+        scheduler = BlockingScheduler()
+        scheduler.add_job(self.process_parameters_producer,"cron",second="*/20")
+        scheduler.add_job(self.process_parameters_consumer,"cron",second="*/30")
+        scheduler.start()
+
+def start_process_parameters():
+    pap = Product_Attachment_Processor()
+    pap.start_process_parameters()
+
+def change_parameters_status():
+    ots_client = getConnect_ots()
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery("parameter_status",-1)
+    ],
+                           must_not_queries=[
+        TermQuery("parameter_status",parameter_status_to_process),
+        TermQuery("parameter_status",parameter_status_process_succeed),
+        TermQuery("parameter_status",parameter_status_process_jump),
+        TermQuery("parameter_status",parameter_status_no_bidfile),
+        TermQuery("parameter_status",parameter_status_not_found),
+
+    ])
+    list_data = []
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
+                                                                        ColumnsToGet([DOCUMENT_PRODUCT_ID],return_type=ColumnReturnType.SPECIFIED))
+
+    list_data.extend(getRow_ots(rows))
+    print("total_count",total_count)
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                            ColumnsToGet([DOCUMENT_PRODUCT_ID],return_type=ColumnReturnType.SPECIFIED))
+        list_data.extend(getRow_ots(rows))
+    from queue import Queue
+    task_queue = Queue()
+    for data in list_data:
+        task_queue.put(data)
+
+    def _handle(data,result_queue):
+        dp = Document_product(data)
+        dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_to_process,True)
+        dp.setValue(DOCUMENT_PRODUCT_PARAMETER,"",True)
+        dp.update_row(ots_client)
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
+
+if __name__ == '__main__':
+    start_process_parameters()
+    # change_parameters_status()
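get_whole_html above caches converted html in redis under "filemd5:<md5>" with a 24-hour TTL, so repeated products pointing at the same attachment skip the convert service; attachments are then tried in classification order (招标文件 first, 采购清单 second), breaking on the first file that yields parameter text. A minimal sketch of that cache-aside pattern (pool_product is the redis connection pool imported from productUtils; key scheme and TTL mirror the code above, cached_convert itself is hypothetical):

    import redis
    from BaseDataMaintenance.maintenance.product.productUtils import pool_product

    def cached_convert(filemd5, convert):
        """Return converted html for filemd5, converting at most once per 24h."""
        db = redis.Redis(connection_pool=pool_product)
        key = "filemd5:%s" % filemd5
        cached = db.get(key)
        if cached is not None:
            return cached.decode("utf8") if isinstance(cached, bytes) else cached
        html = convert(filemd5)           # the expensive call to the convert service
        if html:
            db.set(key, html, 24*60*60)   # expire after one day, as in get_whole_html
        return html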

The diff for this file is too large to display
+ 1 - 1
BaseDataMaintenance/maxcompute/documentDumplicate.py


+ 11 - 11
BaseDataMaintenance/model/ots/document.py

@@ -321,11 +321,15 @@ def turn_document_status():
         #     # must_not_queries=[WildcardQuery("DX004354*")]
         # )
         bool_query = BoolQuery(
-            must_queries=[
-                RangeQuery("crtime","2023-08-30 15:00:00","2023-08-30 23:59:59"),
-                NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
-            ],
-            must_not_queries=[WildcardQuery("attachmenttextcon","*")]
+            # must_queries=[
+            #     RangeQuery("crtime","2023-08-30 15:00:00","2023-08-30 23:59:59"),
+            #     NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
+            # ],
+            # must_not_queries=[WildcardQuery("attachmenttextcon","*")],
+            should_queries=[
+                NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","个体工商户")),
+                NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","机械设备")),
+            ]
 
         )
 
@@ -337,9 +341,7 @@ def turn_document_status():
         _count = len(list_data)
         for _data in list_data:
             _document = Document(_data)
-            _attachment = _data.get(document_attachmenttextcon,"")
-            if _attachment=="":
-                task_queue.put(_document)
+            task_queue.put(_document)
         while next_token:
             rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
@@ -349,9 +351,7 @@ def turn_document_status():
             print("%d/%d"%(_count,total_count))
             for _data in list_data:
                 _document = Document(_data)
-                _attachment = _data.get(document_attachmenttextcon,"")
-                if _attachment=="":
-                    task_queue.put(_document)
+                task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:
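Note on the rewritten query: with only should_queries, Tablestore's BoolQuery matches rows satisfying at least one clause, so this selects documents whose sub_docs_json.win_tenderer equals either term. Making the floor explicit (a sketch; minimum_should_match is a parameter of the SDK's BoolQuery, and the default of 1 when only should clauses are present is an assumption worth verifying):

    bool_query = BoolQuery(
        should_queries=[
            NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","个体工商户")),
            NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","机械设备")),
        ],
        minimum_should_match=1,
    )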

The diff for this file is too large to display
+ 27 - 0
BaseDataMaintenance/model/ots/document_html.py


+ 4 - 0
BaseDataMaintenance/start_product.py

@@ -11,6 +11,7 @@ def main(args=None):
     parser.add_argument("--search_similar",dest="search_similar",action="store_true",help="start product_dict_synchonize process")
     parser.add_argument("--start_process_product",dest="start_process_product",action="store_true",help="start product_dict_synchonize process")
     parser.add_argument("--test",dest="test",action="store_true",help="start product_dict_synchonize process")
+    parser.add_argument("--start_extract_parameter",dest="start_extract_parameter",action="store_true",help="start extract_parameter")
 
     args = parser.parse_args(args)
     if args.product_dict_synchonize:
@@ -28,6 +29,9 @@ def main(args=None):
     if args.test:
         from BaseDataMaintenance.maintenance.product.products import test
         test()
+    if args.start_extract_parameter:
+        from BaseDataMaintenance.maintenance.product.product_parameter import start_process_parameters
+        start_process_parameters()
 
 
 if __name__ == '__main__':
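Since main(args=None) forwards straight to argparse, the new flag can be exercised programmatically as well as from the shell (a usage sketch; module path as in the diff):

    from BaseDataMaintenance.start_product import main

    # equivalent to: python start_product.py --start_extract_parameter
    main(["--start_extract_parameter"])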

Some files were not shown because too many files changed in this diff