
Merge remote-tracking branch 'origin/master'

znj, 1 year ago
parent commit 644768dabc

+ 6 - 4
BaseDataMaintenance/maintenance/dataflow.py

@@ -499,7 +499,7 @@ class Dataflow():
             return _split
         return []
 
-    def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
+    def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count,document_tmp_doctitle]):
 
         list_data = []
         if isinstance(_query,list):
@@ -2205,6 +2205,8 @@ class Dataflow_dumplicate(Dataflow):
         else:
             _dict["project_code"] = ""
         _dict["doctitle_refine"] = _extract.get("doctitle_refine","")
+        if _dict["doctitle_refine"]=="":
+            _dict["doctitle_refine"] = _dict.get("doctitle")
         _dict["nlp_enterprise"] = str({"indoctextcon":_extract.get("nlp_enterprise",[]),
                                        "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])})
         _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
@@ -2256,7 +2258,7 @@ class Dataflow_dumplicate(Dataflow):
             return the_group[:_index+1]
         return []
 
-    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
+    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=True):
         document_less = _dict1
         docid_less = _dict1["docid"]
         docchannel_less = document_less["docchannel"]
@@ -3894,7 +3896,7 @@ class Dataflow_dumplicate(Dataflow):
                 singleNum_keys = _rule["singleNum_keys"]
                 contain_keys = _rule["contain_keys"]
                 multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district])
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle])
                 _i += step
 
 
@@ -4173,7 +4175,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(349638765)
+    df_dump.test_dumplicate(339737931)
     # df_dump.test_merge([292315564],[287890754])
     # df_dump.flow_remove_project_tmp()
     print("takes",time.time()-a)

+ 37 - 25
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -109,7 +109,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                                       "html":_html})
                 else:
                    #already has a process_time, so skip
-                    if len(str(_attach.getProperties().get(attachment_process_time,"")))>10 and _attach.getProperties().get(attachment_status)!=ATTACHMENT_INIT:
+                    if len(str(_attach.getProperties().get(attachment_process_time,"")))>10 and _attach.getProperties().get(attachment_status)!=ATTACHMENT_INIT and not (_attach.getProperties().get(attachment_status)>=ATTACHMENT_MC_FAILED_FROM and _attach.getProperties().get(attachment_status)<=ATTACHMENT_MC_FAILED_TO):
                         log("%s has process_time jump"%(_filemd5))
                         _html = _attach.getProperties().get(attachment_attachmenthtml,"")
                         if _html is None:
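The widened condition sends attachments whose status sits in the MC_FAILED range back through recognition instead of skipping them once a process_time exists. A hedged restatement of the skip predicate, assuming ATTACHMENT_INIT and the two ATTACHMENT_MC_FAILED_* bounds are integer status constants from the attachment model:

    def should_skip(_attach):
        #skip only attachments that were already processed and did not fail
        props = _attach.getProperties()
        status = props.get(attachment_status)
        has_process_time = len(str(props.get(attachment_process_time, ""))) > 10
        failed_earlier = ATTACHMENT_MC_FAILED_FROM <= status <= ATTACHMENT_MC_FAILED_TO
        return has_process_time and status != ATTACHMENT_INIT and not failed_earlier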
@@ -161,9 +161,11 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
 
             _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
             dhtml.setValue(document_tmp_dochtmlcon,_dochtmlcon,True)
+            dhtml.delete_bidi_a()
             dtmp = Document_tmp(item)
 
 
+            start_time = time.time()
             #call the recognition interface
             _succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
 
@@ -208,6 +210,9 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 ackMsg(conn,message_id)
             log("document:%d get attachments with result:%s %s retry_times:%d"%(item.get("docid"),str(_succeed),str(_to_ack),_retry_times))
         except Exception as e:
+            traceback.print_exc()
+            if time.time()-start_time<10:
+                item["retry_times"] -= 1
             if send_msg_toacmq(self.pool_mq,json.dumps(item,cls=MyEncoder,ensure_ascii=False),self.mq_attachment):
                 ackMsg(conn,message_id)
 
@@ -1023,6 +1028,7 @@ class Dataflow_init(Dataflow):
             self.get_count = 1000
             self.count = self.get_count
             self.begin_docid = None
+            self.mq_init = "/queue/dataflow_init"
             self.mq_attachment = "/queue/dataflow_attachment"
             self.mq_extract = "/queue/dataflow_extract"
             self.pool_mq1 = ConnectorPool(1,4,getConnect_activateMQ)
@@ -1043,32 +1049,38 @@ class Dataflow_init(Dataflow):
             return next_docid
 
         def on_message(self, headers):
-            next_docid = int(self.getNextDocid())
-            partitionkey = int(next_docid%500+1)
-            message_id = headers.headers["message-id"]
-            body = json.loads(headers.body)
-            body[document_tmp_partitionkey] = partitionkey
-            body[document_tmp_docid] = next_docid
-            if body.get(document_original_docchannel) is None:
-                body[document_original_docchannel] = body.get(document_docchannel)
-            page_attachments = body.get(document_tmp_attachment_path,"[]")
-            _uuid = body.get(document_tmp_uuid,"")
-            if page_attachments!="[]":
-                status = random.randint(1,10)
-                body[document_tmp_status] = status
-                if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_attachment):
-                    log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
-                    ackMsg(self.conn,message_id)
+            #parse before the try so the except handler can requeue `body` safely
+            message_id = headers.headers["message-id"]
+            body = json.loads(headers.body)
+            try:
+                next_docid = int(self.getNextDocid())
+                partitionkey = int(next_docid%500+1)
+                body[document_tmp_partitionkey] = partitionkey
+                body[document_tmp_docid] = next_docid
+                if body.get(document_original_docchannel) is None:
+                    body[document_original_docchannel] = body.get(document_docchannel)
+                page_attachments = body.get(document_tmp_attachment_path,"[]")
+                _uuid = body.get(document_tmp_uuid,"")
+                if page_attachments!="[]":
+                    status = random.randint(1,10)
+                    body[document_tmp_status] = status
+                    if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_attachment):
+                        log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
+                        ackMsg(self.conn,message_id)
+                    else:
+                        log("send_msg_error on init listener")
                 else:
-                    log("send_msg_error on init listener")
-            else:
-                status = random.randint(11,50)
-                body[document_tmp_status] = status
-                if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_extract):
-                    log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
+                    status = random.randint(11,50)
+                    body[document_tmp_status] = status
+                    if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_extract):
+                        log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
+                        ackMsg(self.conn,message_id)
+                    else:
+                        log("send_msg_error on init listener")
+            except Exception as e:
+                traceback.print_exc()
+                if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_init):
+                    log("init error")
                     ackMsg(self.conn,message_id)
-                else:
-                    log("send_msg_error on init listener")
 
         def __del__(self):
             self.conn.disconnect()
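The listener rewrite wraps the whole handler in try/except: on any failure the parsed body is pushed onto the new /queue/dataflow_init queue and the message is acked only when that requeue succeeds, so a broken message is retried rather than lost. A minimal sketch of the pattern, with process() as a hypothetical stand-in for the docid/partitionkey/status assignment above (ackMsg, send_msg_toacmq and MyEncoder are the repo's own helpers):

    import json, traceback

    def on_message_safe(conn, headers, pool, mq_init="/queue/dataflow_init"):
        message_id = headers.headers["message-id"]
        body = json.loads(headers.body)      #parse outside the try: the except path needs body
        try:
            process(body)                    #hypothetical stand-in for the real handler
            ackMsg(conn, message_id)
        except Exception:
            traceback.print_exc()
            if send_msg_toacmq(pool, json.dumps(body, cls=MyEncoder), mq_init):
                ackMsg(conn, message_id)     #safe to ack: a copy is back on the init queue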

+ 14 - 13
BaseDataMaintenance/maintenance/product/1.py

@@ -1,15 +1,16 @@
-
+#coding:utf8
 
 import re
-pattern="(^|★|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册部\.::]))|" \
-        "([\s★\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
-        "([\s★\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
-        "([\s★\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
-        "([\s★\*]*)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?(?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>)))|" \
-        "([\s★\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))" \
-        ""
-print(re.search(pattern,"(一)4K内窥镜荧光摄像系统主机").groupdict())
+from bs4 import BeautifulSoup
+
+p = '''
+包名称:包B:电脑恒温电蜡疗仪,全自动红外母乳分析仪,生物反馈治疗仪,磁刺激仪,多参数生物反馈仪、婴幼儿养育照护指导中心综合管理平台、多功能婴儿培养箱供应商名称:济南旭博医疗设备有限公司
+<table border="1"><tbody><tr><td colspan="1">货物名称</td><td colspan="1">品牌</td><td colspan="1">产地</td><td colspan="1">规格要求</td><td colspan="1">单价(元)/优惠率</td><td colspan="1">数量/单位</td></tr><tr><td colspan="1">婴幼儿养育照护指导中心综合管理平台</td><td colspan="1">北京零六</td><td colspan="1">北京/北京零六爱成长健康科技有限公司</td><td colspan="1">爱成长</td><td colspan="1">220000.000000</td><td colspan="1">1套</td></tr><tr><td colspan="1">电脑恒温电蜡疗仪</td><td colspan="1">苏州好博 </td><td colspan="1">苏州/苏州好博医疗器械股份有限公司</td><td colspan="1">HB-LY3</td><td colspan="1">104000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">全自动红外母乳分析仪</td><td colspan="1">泰安康宇</td><td colspan="1">泰安/泰安市康宇医疗器械有限公司</td><td colspan="1">KY-9002</td><td colspan="1">200000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">多功能婴儿培养箱</td><td colspan="1">宁波戴维</td><td colspan="1">宁波/宁波戴维医疗器械股份有限公司</td><td colspan="1">YP-3000</td><td colspan="1">302000.000000</td><td colspan="1">2台</td></tr><tr><td colspan="1">多参数生物反馈仪</td><td colspan="1">南京伟思</td><td colspan="1">南京/南京伟思医疗科技股份有限公司</td><td colspan="1">Infiniti3000C</td><td colspan="1">220000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">生物反馈治疗仪</td><td colspan="1">南京锐诗得</td><td colspan="1">南京/南京锐诗得医疗科技有限公司</td><td colspan="1">RSD RM4</td><td colspan="1">87000.000000</td><td colspan="1">1台</td></tr><tr><td colspan="1">磁刺激仪</td><td colspan="1">南京伟思</td><td colspan="1">南京/南京伟思医疗科技股份有限公司</td><td colspan="1">Magneuro100HZ</td><td colspan="1">355000.000000</td><td colspan="1">1台</td></tr></tbody></table>
+
+'''
+_text = BeautifulSoup(p,"html5lib").get_text()
+print(_text)
+meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
+not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|总价|专家论证意见|评标方法|代理服务费|售后服务|邮政编码|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方|甲方|合同|报价|采购人|技术支持服务"
+print(list(set(re.findall(meter_pattern,_text))))
+print(list(set(re.findall(not_meter_pattern,_text))))
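The rewritten 1.py is a scratch test for the meter_pattern/not_meter_pattern regexes shared with htmlparser.py: it flattens a real award-notice table to text and prints which parameter-like and commercial tokens fire. The same check on a one-line input, using a trimmed subset of the pattern for readability:

    import re

    meter_subset = r"[><≤≥±]\d+|功率|规格|尺寸"   #trimmed from the full meter_pattern above
    text = "电脑恒温电蜡疗仪,规格HB-LY3,功率≥200W"
    print(set(re.findall(meter_subset, text)))    #e.g. {'规格', '功率', '≥200'}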

+ 313 - 147
BaseDataMaintenance/maintenance/product/htmlparser.py

@@ -2,22 +2,22 @@
 
 import re
 
-from BaseDataMaintenance.maintenance.product.productUtils import *
+from BaseDataMaintenance.maintenance.product.productUtils import is_similar
 import logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-
-
+logger.setLevel(logging.INFO)
 
 
 from bs4 import BeautifulSoup
 import copy
 
 end_pattern = "商务要求|评分标准|商务条件|商务条件"
-_param_pattern = "(产品|技术|清单[及和]?|配置|参数|具体|明细[及和]?|项目|货物|服务)(指标|配置|要求|参数|需求|规格)|配置清单|(质量|技术).{,10}要求|验收标准|^参数$"
-meter_pattern = "角度|容积|色彩|帧率|磁场|强度|允差|噪音|材质|频率|阻抗|浓度|范围|误差|精确|温度|可调|设定值|功能|检测|高度|宽度|模式|尺寸|重量|峰值|容量|寿命|稳定性|高温|电源|电压|功率|压力|压强"
+_param_pattern = "(产品|技术|清单|配置|参数|具体|明细|项目|招标|货物|服务|规格|工作|具体)[及和与]?(指标|配置|条件|要求|参数|需求|规格|条款|名称及要求)|配置清单|(质量|技术).{,10}要求|验收标准|^(参数|功能)$"
+meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
+not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|专家论证意见|评标方法|代理服务费|售后服务|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方"
+
 
 def getTrs(tbody):
     #collect all tr elements
@@ -128,7 +128,7 @@ class ParseDocument():
         _body = self.soup.find("body")
         if _body is not None:
             self.soup = _body
-        self.list_obj = self.soup.find_all(recursive=False)
+        self.list_obj = self.get_soup_objs(self.soup)
 
         # for obj in self.list_obj:
         #     print("obj",obj.get_text()[:20])
@@ -140,6 +140,18 @@ class ParseDocument():
         # if self.parseTree:
         #     self.parseTree.printParseTree()
 
+    def get_soup_objs(self,soup,list_obj=None):
+        if list_obj is None:
+            list_obj = []
+        childs = soup.find_all(recursive=False)
+        for _obj in childs:
+            childs1 = _obj.find_all(recursive=False)
+            if len(childs1)==0 or len(_obj.get_text())<40 or _obj.name=="table":
+                list_obj.append(_obj)
+            else:
+                self.get_soup_objs(_obj,list_obj)
+        return list_obj
+
     def fix_tree(self,_product):
         products = extract_products(self.tree,_product)
         if len(products)>0:
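get_soup_objs replaces the former flat find_all(recursive=False) with a depth-first flattening: a node is kept whole when it is a leaf, carries under 40 characters of text, or is a table, and longer containers are split into their children. A minimal free-function sketch of the same traversal on bs4 tags:

    from bs4 import BeautifulSoup

    def flatten(node, out):
        #keep leaves, short blocks and tables intact; descend into long containers
        for child in node.find_all(recursive=False):
            if not child.find_all(recursive=False) or len(child.get_text()) < 40 or child.name == "table":
                out.append(child)
            else:
                flatten(child, out)
        return out

    body = BeautifulSoup("<div><p>%s</p><p>%s</p></div>" % ("甲"*50, "乙"*50), "html5lib").body
    print([t.name for t in flatten(body, [])])   #['p', 'p']: the long wrapper div is split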
@@ -148,12 +160,16 @@ class ParseDocument():
     def print_tree(self,tree,append=""):
         if append=="":
             self.set_tree_id = set()
+
+            # for t in tree:
+            #     logger.debug("%s text:%s title:%s title_text:%s before:%s after%s product:%s"%("==>",t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"]))
+
         for t in tree:
             _id = id(t)
             if _id in self.set_tree_id:
                 continue
             self.set_tree_id.add(_id)
-            logger.debug("%s %s %s %s %s"%(append,t["text"][:50],t["sentence_title"],t["title_before"],t["title_after"]))
+            logger.debug("%s text:%s title:%s title_text:%s before:%s after%s product:%s"%(append,t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"]))
             childs = t["child_title"]
             self.print_tree(childs,append=append+"-|")
 
@@ -162,18 +178,18 @@ class ParseDocument():
             return True
         return False
 
-    def find_title_by_pattern(self,_text,_pattern="(^|★|▲|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册包标部\.::]))|" \
-                                             "([\s★▲\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
-                                             "([\s★▲\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
+    def find_title_by_pattern(self,_text,_pattern="(^|★|▲|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册包标部.::、、]+))|" \
+                                             "([\s★▲\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>[、章册包标部.::、、]+))|" \
+                                             "([\s★▲\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]+))|" \
                                              "([\s★▲\*]*)(?P<title_5>(?P<title_5_index_0_0>^)(?P<title_5_index_1_1>[一二三四五六七八九十]+)(?P<title_5_index_2_0>)[^一二三四五六七八九十节章册部\.::、、])|" \
                                              "([\s★▲\*]*)(?P<title_12>(?P<title_12_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_12_index_1_1>\d{1,2})(?P<title_12_index_2_0>[\..、\s\-]?))|"\
                                              "([\s★▲\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
                                              "([\s★▲\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
-                                             "([\s★▲\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..包标::、\s\-]?))|" \
-                                             "([\s★▲\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-包标]?))|" \
-                                             "([\s★▲\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))包标]))|" \
-                                             "([\s★▲\*]*)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>[))包标]))|" \
-                                             "([\s★▲\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))" \
+                                             "([\s★▲\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..包标::、\s\-]*))|" \
+                                             "(^[\s★▲\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-包标]*))|" \
+                                             "([\s★▲\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))包标\..::、]+))|" \
+                                             "([\s★▲\*]*)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>[))包标\..::、]+))|" \
+                                             "([\s★▲\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))"
                               ):
         _se = re.search(_pattern,_text)
         groups = []
@@ -324,6 +340,7 @@ class ParseDocument():
             max_length = max(list_length)
         else:
             max_length = 40
+        max_length = min(max_length,40)
 
         logger.debug("%s:%d"%("max_length",max_length))
 
@@ -335,6 +352,9 @@ class ParseDocument():
         dict_before,illegal_sentence = self.count_title_before(list_obj)
         for obj_i in range(len(list_obj)):
             obj = list_obj[obj_i]
+
+            # logger.debug("==obj %s"%obj.text[:20])
+
             _type = "sentence"
             _text = standard_product(obj.text)
             if obj.name=="table":
@@ -355,36 +375,62 @@ class ParseDocument():
             list_table = None
             block = False
 
+            has_product = False
+
             if _type=="sentence":
                 if _text in illegal_sentence:
                     continue
 
-                _fix = False
-                for p in products:
-                    if re.sub("^(\d[.、]?)+","",_text.strip())==p:
-                        title_before = "=产品"
-                        sentence_title = "title_0"
-                        sentence_title_text = p
-                        title_index = "0"
-                        title_after = "产品="
-                        next_index = "0"
-                        _fix = True
-                        break
 
-                if not _fix:
-                    sentence_groups = self.find_title_by_pattern(_text[:10])
-                    if sentence_groups:
-                        title_before = standard_title_context(sentence_groups[1][1])
-                        if title_before in dict_before and dict_before[title_before]>1:
-                            sentence_title = sentence_groups[0][0]
-                            sentence_title_text = sentence_groups[0][1]
-                            title_index = sentence_groups[-2][1]
-
-                            title_after = sentence_groups[-1][1]
-                            next_index = self.get_next_title(title_index)
-                        else:
-                            title_before = None
+                sentence_groups = self.find_title_by_pattern(_text[:10])
+                if sentence_groups:
+                    title_before = standard_title_context(sentence_groups[1][1])
+                    title_after = sentence_groups[-1][1]
+                    sentence_title_text = sentence_groups[0][1]
+                    other_text = _text.replace(sentence_title_text,"")
+                    if (title_before in dict_before and dict_before[title_before]>1) or title_after!="":
+                        sentence_title = sentence_groups[0][0]
+
+                        title_index = sentence_groups[-2][1]
+                        next_index = self.get_next_title(title_index)
+
+                        other_text = _text.replace(sentence_title_text,"")
+
+                        for p in products:
+                            if other_text.strip()==p.strip():
+                                has_product = True
 
+                    else:
+                        _fix = False
+
+                        for p in products:
+                            if other_text.strip()==p.strip():
+                                title_before = "=产品"
+                                sentence_title = "title_0"
+                                sentence_title_text = p
+                                title_index = "0"
+                                title_after = "产品="
+                                next_index = "0"
+                                _fix = True
+                                has_product = True
+                                break
+                        if not _fix:
+                            title_before = None
+                            title_after = None
+                            sentence_title_text = None
+                else:
+                    if len(_text)<40 and re.search(_param_pattern,_text) is not None:
+                        for p in products:
+                            if _text.find(p)>=0:
+                                title_before = "=产品"
+                                sentence_title = "title_0"
+                                sentence_title_text = p
+                                title_index = "0"
+                                title_after = "产品="
+                                next_index = "0"
+                                _fix = True
+                                has_product = True
+                                break
 
             if _type=="sentence":
                 if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
@@ -402,6 +448,8 @@ class ParseDocument():
                 _table = _soup.find("table")
                 if _table is not None:
                     list_table = getTable(_table)
+                    if len(list_table)==0:
+                        continue
                     table_columns = len(list_table[0])
 
                     if auto_merge_table:
@@ -428,7 +476,7 @@ class ParseDocument():
                 _data = {"type":_type, "text":_text,"list_table":list_table,"line_width":len(_text),"sentence_title":sentence_title,"title_index":title_index,
                          "sentence_title_text":sentence_title_text,"sentence_groups":sentence_groups,"parent_title":parent_title,
                          "child_title":childs,"title_before":title_before,"title_after":title_after,"title_next":title_next,"next_index":next_index,
-                         "block":block}
+                         "block":block,"has_product":has_product}
 
                 if _type=="table":
                     last_table = _data
@@ -543,22 +591,56 @@ class ParseDocument():
 
                 list_data.append(_data)
 
+        for _data in list_data:
+
+            childs = _data["child_title"]
+
+            for c_i in range(len(childs)):
+                cdata = childs[c_i]
+                if cdata["has_product"]:
+                    continue
+                else:
+                    if c_i>0:
+                        last_cdata = childs[c_i-1]
+                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
+                            cdata["has_product"] = True
+                    if c_i<len(childs)-1:
+                        last_cdata = childs[c_i+1]
+                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
+                            cdata["has_product"] = True
+            #second pass runs in reverse; index childs directly so the neighbour
+            #lookups refer to cdata's own siblings rather than the loop counter
+            for c_i in range(len(childs)-1,-1,-1):
+                cdata = childs[c_i]
+                if cdata["has_product"]:
+                    continue
+                else:
+                    if c_i>0:
+                        prev_cdata = childs[c_i-1]
+                        if cdata["sentence_title"] is not None and prev_cdata["sentence_title"] is not None and prev_cdata["title_before"]==cdata["title_before"] and prev_cdata["title_after"]==cdata["title_after"] and prev_cdata["has_product"]:
+                            cdata["has_product"] = True
+                    if c_i<len(childs)-1:
+                        next_cdata = childs[c_i+1]
+                        if cdata["sentence_title"] is not None and next_cdata["sentence_title"] is not None and next_cdata["title_before"]==cdata["title_before"] and next_cdata["title_after"]==cdata["title_after"] and next_cdata["has_product"]:
+                            cdata["has_product"] = True
+
+
         return list_data
 
+
 def standard_title_context(_title_context):
     return _title_context.replace("(","(").replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")
 
 def standard_product(sentence):
     return sentence.replace("(","(").replace(")",")")
 
-def extract_products(list_data,_product,_param_pattern = "产品名称|采购内存|标的名称|采购内容|(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体|标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)[\))的]?([、\w]{,4}名称|内容|描述)|标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$"):
+def extract_products(list_data,_product,_param_pattern = "产品名称|设备材料|采购内存|标的名称|采购内容|(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体|标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)[\))的]?([、\w]{,4}名称|内容|描述)|标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$"):
     _product = standard_product(_product)
     list_result = []
+    list_table_products = []
     for _data_i in range(len(list_data)):
         _data = list_data[_data_i]
         _type = _data["type"]
         _text = _data["text"]
-        table_products = []
+
         if _type=="table":
             list_table = _data["list_table"]
             if list_table is None:
@@ -584,6 +666,8 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|采购内
             for line_i in range(len(list_table)):
                 line = list_table[line_i]
                 for cell_i in list_head_index:
+                    if cell_i>=len(line):
+                        continue
                     cell = line[cell_i]
                     cell_text = cell[0]
                     head_cell_text += cell_text
@@ -592,27 +676,29 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|采购内
             if re.search("招标人|采购人|项目编号|项目名称|金额|^\d+$",head_cell_text) is not None:
                 list_head_index = []
 
-
             for line in list_table:
                 line_text = ",".join([cell[0] for cell in line])
                 for cell_i in range(len(line)):
                     cell = line[cell_i]
                     cell_text = cell[0]
-                    if cell_text is not None and _product is not None and len(cell_text)<len(_product)*10 and re.search(_product,cell_text) is not None and re.search("单价|数量|总价|规格|品牌|型号|用途|要求|采购量",line_text) is not None:
+                    if cell_text is not None and _product is not None and len(cell_text)<len(_product)*10 and cell_text.find(_product)>=0 and re.search("单价|数量|总价|规格|品牌|型号|用途|要求|采购量",line_text) is not None:
                         list_head_index.append(cell_i)
 
             list_head_index = list(set(list_head_index))
             if len(list_head_index)>0:
-                for line_i in range(_begin_index,len(list_table)):
-                    line = list_table[line_i]
-                    has_number = False
-                    for cell_i in range(len(line)):
-                        cell = line[cell_i]
-                        cell_text = cell[0]
-                        if re.search("^\d+$",cell_text) is not None:
-                            has_number = True
+                has_number = False
+                for cell_i in list_head_index:
+                    table_products = []
+
+                    for line_i in range(_begin_index,len(list_table)):
+                        line = list_table[line_i]
+
+                        for _i in range(len(line)):
+                            cell = line[_i]
+                            cell_text = cell[0]
+                            if re.search("^\d+$",cell_text) is not None:
+                                has_number = True
 
-                    for cell_i in list_head_index:
                         if cell_i>=len(line):
                             continue
                         cell = line[cell_i]
@@ -621,10 +707,23 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|采购内
                             if re.search("^[\da-zA-Z]+$",cell_text) is None:
                                 table_products.append(cell_text)
 
-        if len(table_products)>0:
-            if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=20:
-                list_result.extend(table_products)
-    list_result = list(set([a for a in list_result if len(a)>1 and len(a)<20 and re.search("预算|合计|金额|万元|运费",a) is None]))
+                    if len(table_products)>0:
+                        logger.debug("table products %s"%(str(table_products)))
+                        if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=30:
+                            if re.search("招标人|代理人|预算|数量|交货期|品牌|产地","".join(table_products)) is None:
+                                list_table_products.append(table_products)
+    _find = False
+    for table_products in list_table_products:
+        for _p in table_products:
+            if is_similar(_product,_p,90):
+                _find = True
+                logger.debug("similar table_products %s"%(str(table_products)))
+                list_result = list(set([a for a in table_products if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费|^其他$",a) is None]))
+                break
+    if not _find:
+        for table_products in list_table_products:
+            list_result.extend(table_products)
+        list_result = list(set([a for a in list_result if len(a)>1 and len(a)<30 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
     return list_result
 
 
@@ -671,12 +770,20 @@ def get_correct_product(product,products):
 def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
     _text = ""
 
+    end_next = False
     for _child in childs:
 
         child_text = _child.get("text")
 
+
         if child_text.find(_product)>=0:
-            is_begin = True
+            if not is_begin:
+                is_begin = True
+                if not end_next:
+                    if _child["sentence_title"] is not None and isinstance(_child["title_next"],dict) and _child["title_next"]["sentence_title"] is not None:
+                        end_next = True
+                        end_title = _child["title_next"]
+                        logger.debug("end_title %s "%end_title["text"])
 
         logger.debug("%s-%s-%s"%("get_childs_text",child_text[:10],str(is_begin)))
 
@@ -684,13 +791,15 @@ def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
             if child_text.find(p)>=0 and is_similar(_product,p,90):
                 is_begin = True
 
-            if child_text.find(_product)<0 and  child_text.find(p)>=0 and not is_similar(_product,p,80):
+            if child_text.find(_product)<0  and not is_similar(_product,p,80) and  (child_text.find(p)>=0 or _child["has_product"]):
                 if is_begin:
                     is_end = True
+                    logger.debug("%s-%s-%s"%("get_childs_text end",child_text[:10],p))
                 break
         if re.search(end_pattern,child_text) is not None:
             if is_begin:
                 is_end = True
+                logger.debug("%s-%s-%s"%("get_childs_text end",child_text[:10],str(is_end)))
 
         if is_begin and is_end:
             break
@@ -699,45 +808,58 @@ def get_childs_text(childs,_product,products,is_begin=False,is_end=False):
             _text += _child.get("text")+"\r\n"
         childs2 = _child.get("child_title",[])
 
+
         if len(childs2)>0:
             for _child2 in childs2:
                 child_text,is_begin,is_end = get_childs_text([_child2],_product,products,is_begin)
-                if is_begin and is_end:
-                    break
-                else:
-                    if is_begin:
-                        _text += child_text
+                if is_begin:
+                    _text += child_text
+                    if is_end:
+                        break
+
+        if end_next:
+            is_end = True
+
+    #     logger.debug("%s-%s-%s"%("get_childs_text1",_text,str(is_begin)))
+    # logger.debug("%s-%s-%s"%("get_childs_text2",_text,str(is_begin)))
     return _text,is_begin,is_end
 
 def extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result,):
     _data = list_data[_data_i]
     childs = _data.get("child_title",[])
     if len(childs)>0:
-        child_text,_,_ = get_childs_text([parent_title],_product,products)
-        logger.info("extract_parameters_by_tree child_text:%s"%child_text)
+        child_text,_,_ = get_childs_text([_data],_product,products)
         if len(child_text)>0:
+            logger.info("extract_type by_tree child_text:%s"%child_text)
             list_result.append(child_text)
-            return True
     if parent_title is not None:
+        child_text,_,_ = get_childs_text([parent_title],_product,products)
+        if len(child_text)>0:
+            logger.info("extract_type by_tree child_text:%s"%child_text)
+            list_result.append(child_text)
+
         childs = parent_title.get("child_title",[])
         if len(childs)>0:
 
             range_data = get_range_data_by_childs(list_data[_data_i:],childs)
             p_text = ""
             _find = False
+            end_id = id(_data["title_next"]) if isinstance(_data["title_next"],dict) and _data["title_next"]["sentence_title"] is not None else None
             for pdata in range_data:
-                ptype = _data["type"]
                 ptext = pdata["text"]
                 for p in products:
-                    if ptext.find(_product)<0 and  ptext.find(p)>=0:
+                    if ptext.find(_product)<0 and  (ptext.find(p)>=0 or pdata["has_product"]):
                         _find = True
                         break
                 if re.search(end_pattern,ptext) is not None:
                     _find = True
                 if _find:
                     break
+                if id(pdata)==end_id:
+                    break
                 p_text += ptext+"\r\n"
             if len(p_text)>0:
+                logger.debug("extract_type by parent range_text:%s"%p_text)
                 list_result.append(p_text)
                 return True
     return False
@@ -766,6 +888,7 @@ def get_table_pieces(_text,_product,products,list_result,_find):
                 list_trs.append(tr)
         if len(list_trs)>0:
             table_html = "<table>%s</table>"%("\r\n".join([str(a) for a in list_trs]))
+            logger.debug("extract_type table slices %s"%(table_html))
             list_result.append(table_html)
 
 def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result):
@@ -778,8 +901,9 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
         max_length = max([len(a) for a in list_table])
         min_length = min([len(a) for a in list_table])
         text_line_first = ",".join(a[0] for a in list_table[0])
-        if min_length<max_length/2:
-            return
+        if max_length>10:
+            if min_length<max_length/2:
+                return
         last_data = list_data[_data_i-1]
         _flag = False
         if last_data["type"]=="sentence" and last_data["text"].find(_product)>=0:
@@ -789,8 +913,8 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
         if re.search(_param_pattern,text_line_first) is not None and text_line_first.find(_product)>=0:
             _flag = True
         if _flag:
-            logger.debug("extract_type add all table %s"%_text)
             if len(products)==0:
+                logger.debug("extract_type whole table by param and product %s"%(_text))
                 list_result.append(_text)
             else:
                 for p in products:
@@ -813,20 +937,99 @@ def extract_parameters_by_table(_product,products,_param_pattern,list_data,_data
             for line in list_table:
                 for cell in line:
                     cell_text = cell[0]
-                    if len(cell_text)>50 and len(re.findall("\d+",cell_text))>10 and cell_text.find(_product)>=0:
-                        list_result.append(cell_text)
+                    if len(cell_text)>50 and len(re.findall(meter_pattern,cell_text))>5 and cell_text.find(_product)>=0:
+                        _f = True
+                        #use distinct names so the long parameter cell found above
+                        #is not shadowed by the cells scanned here
+                        for other_cell in line:
+                            if not _f:
+                                break
+                            other_text = other_cell[0]
+                            for p in products:
+                                if other_text.find(p)>=0 and p!=_product:
+                                    _f = False
+                                    break
+                        if _f:
+                            logger.debug("extract_type param column %s"%(cell_text))
+                            list_result.append(cell_text)
                     if len(cell_text)<len(_product)*10 and str(cell_text).find(_product)>=0:
                         for _index in list_head_index:
                             if _index>=len(line):
                                 continue
                             _cell = line[_index]
                             if len(cell[0])>0:
-                                logger.info("%s-%s"%("add on table",_cell[0]))
+                                logger.info("%s-%s"%("extract_type add on table text:",_cell[0]))
                                 list_result.append(_cell[0])
         if not _flag and (re.search(_param_pattern,_text) is not None or (parent_title is not None and re.search(_param_pattern,parent_title["text"]) is not None)) and _text.find(_product)>=0:
             get_table_pieces(_text,_product,products,list_result,False)
 
 
+def extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project):
+    _text = _data["text"]
+    if _text.find(_product)>=0:
+        parent_title = _data.get("parent_title")
+        parent_text = ""
+        parent_parent_title = None
+        parent_parent_text = ""
+        parent_title_index = None
+        parent_parent_title_index = None
+        childs = get_childs([_data])
+
+        child_find = False
+        for c in childs:
+            if re.search(_param_pattern,c["text"]) is not None and len(c["text"])<30:
+                logger.debug("child text %s"%(c["text"]))
+                child_find = True
+                break
+
+        extract_text,_,_ = get_childs_text([_data],_product,products)
+        logger.debug("childs found extract_text %s %s"%(str(child_find),extract_text))
+        if child_find:
+            if len(extract_text)>0:
+                list_result.append(extract_text)
+        else:
+            limit_nums = len(_product)*2+5
+            if len(_product)<=3:
+                limit_nums += 6
+            if _text.find("数量")>=0:
+                limit_nums += 6
+            if len(_text)<=limit_nums and _data["sentence_title"] is not None:
+                if re.search(meter_pattern,extract_text) is not None:
+                    list_result.append(extract_text)
+            elif len(re.findall(meter_pattern,extract_text))>2:
+                list_result.append(extract_text)
+
+        if parent_title is not None:
+            parent_text = parent_title.get("text","")
+            parent_parent_title = parent_title.get("parent_title")
+            parent_title_index = parent_title["title_index"]
+            if parent_parent_title is not None:
+                parent_parent_text = parent_parent_title.get("text","")
+                parent_parent_title_index = parent_parent_title["title_index"]
+
+        _suit = False
+        if re.search(_param_pattern,_text) is not None and len(_text)<50:
+            _suit = True
+        if re.search(_param_pattern,parent_text) is not None and len(parent_text)<50:
+            _suit = True
+        if re.search(_param_pattern,parent_parent_text) is not None and len(parent_parent_text)<50:
+            _suit = True
+        if _suit:
+            logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
+            if not extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result):
+                logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
+                extract_parameters_by_tree(_product,products,list_data,_data_i,parent_parent_title,list_result)
+
+    if re.search(_param_pattern,_text) is not None and len(_text)<50:
+        childs = _data["child_title"]
+        if len(childs)>0:
+            extract_text,_,_ = get_childs_text([_data],_product,products)
+            if len(extract_text)>0:
+                logger.debug("extract_type param-product %s"%(extract_text))
+                list_result.append(extract_text)
+            elif is_project:
+                extract_text,_,_ = get_childs_text([_data],_product,products,is_begin=True)
+                if len(extract_text)>0 and re.search(meter_pattern,extract_text) is not None:
+                    logger.debug("extract_type sentence is_project param-product is product %s"%(extract_text))
+                    list_result.append(extract_text)
 
 def getBestProductText(list_result,_product,products):
     list_result.sort(key=lambda x:len(re.findall(meter_pattern+"|"+'[::;;]|\d+[%A-Za-z]+',BeautifulSoup(x,"html5lib").get_text())), reverse=True)
@@ -840,7 +1043,7 @@ def getBestProductText(list_result,_product,products):
         _result = list_result[i]
         _check = True
         _result_text = BeautifulSoup(_result,"html5lib").get_text()
-        _search = re.search("项目编号[::]|项目名称[::]|联合体投标",_result)
+        _search = re.search("项目编号[::]|项目名称[::]|联合体投标|开户银行",_result)
         if _search is not None:
             logger.debug("result%d error illegal text %s"%(i,str(_search)))
             _check = False
@@ -849,13 +1052,31 @@ def getBestProductText(list_result,_product,products):
                 if _result_text.find(p)>0 and not (is_similar(_product,p,80) or p.find(_product)>=0 or _product.find(p)>=0):
                     logger.debug("result%d error product scoss %s"%(i,p))
                     _check = False
-        if len(_result_text)<50:
+        if len(_result_text)<100:
             if re.search(meter_pattern,_result_text) is None:
                 logger.debug("result%d error text min count"%(i))
                 _check = False
         if len(_result_text)>5000:
-            logger.debug("result%d error text max count"%(i))
+            if len(_result_text)>10000:
+                logger.debug("result%d error text max count"%(i))
+                _check = False
+            elif len(re.findall(meter_pattern,_result_text))<10:
+                logger.debug("result%d error text max count less meter"%(i))
+                _check = False
+
+        list_find = list(set(re.findall(meter_pattern,_result_text)))
+
+        not_list_find = list(set(re.findall(not_meter_pattern,_result_text)))
+        _count = len(list_find)-len(not_list_find)
+        has_num = False
+        for _find in list_find:
+            if re.search('[0-9a-zA-Z]',_find) is not None:
+                has_num = True
+                break
+        if not(_count>=2 and has_num or _count>=5):
+            logger.debug("result%d error match not enough"%(i))
             _check = False
+
         if _check:
             return _result
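The acceptance test in getBestProductText is now score-based rather than a bare length check: the count of distinct meter_pattern hits minus not_meter_pattern hits must reach 2 with at least one alphanumeric hit, or 5 outright. A restatement of that rule as a standalone predicate (patterns as defined at the top of this file):

    import re

    def passes_meter_check(text, meter_pattern, not_meter_pattern):
        hits = set(re.findall(meter_pattern, text))
        noise = set(re.findall(not_meter_pattern, text))
        count = len(hits) - len(noise)
        has_num = any(re.search("[0-9a-zA-Z]", h) for h in hits)
        #accept only when parameter-like tokens clearly outnumber commercial ones
        return (count >= 2 and has_num) or count >= 5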
 
@@ -868,6 +1089,11 @@ def extract_product_parameters(list_data,_product):
     _product = get_correct_product(_product,products)
     logger.debug("all products %s-%s"%(_product,str(products)))
     is_project = False
+    if re.search("项目名称|采购项目",_product) is not None:
+        is_project = True
+        
+    if len(products)==1 and is_similar(products[0],_product,90):
+        is_project = True
     _find_count = 0
     for _data_i in range(len(list_data)):
         _data = list_data[_data_i]
@@ -876,84 +1102,23 @@ def extract_product_parameters(list_data,_product):
         if _type=="sentence":
             if _text.find(_product)>=0:
                 _find_count += 1
-                if re.search("项目名称|采购项目",_text) is not None:
-                   is_project = True
-                if re.search("项目名称|采购项目",_product) is not None:
+                if re.search("项目名称|采购项目",_text) is not None and re.search("等",_text) is not None:
                     is_project = True
-                parent_title = _data.get("parent_title")
-                parent_text = ""
-                parent_parent_title = None
-                parent_parent_text = ""
-                parent_title_index = None
-                parent_parent_title_index = None
-                childs = get_childs([_data])
-
-
-                child_find = False
-                for c in childs:
-                    if re.search(_param_pattern,c["text"]) is not None and len(c["text"])<30:
-                        child_find = True
-                        break
-
-                extract_text,_,_ = get_childs_text([_data],_product,products)
-                logger.debug("childs found extract_text %s"%extract_text)
-                if child_find:
-                    if len(extract_text)>0:
-                        list_result.append(extract_text)
-                else:
-                    if len(_text)<len(_product)+10 and _data["sentence_title"] is not None:
-                        if re.search(meter_pattern,extract_text) is not None:
-                            list_result.append(extract_text)
-
-                if parent_title is not None:
-                    parent_text = parent_title.get("text","")
-                    parent_parent_title = parent_title.get("parent_title")
-                    parent_title_index = parent_title["title_index"]
-                    if parent_parent_title is not None:
-                        parent_parent_text = parent_parent_title.get("text","")
-                        parent_parent_title_index = parent_parent_title["title_index"]
-
-                _suit = False
-                if re.search(_param_pattern,_text) is not None and len(_text)<50:
-                    _suit = True
-                if re.search(_param_pattern,parent_text) is not None and len(parent_text)<50:
-                    _suit = True
-                if re.search(_param_pattern,parent_parent_text) is not None and len(parent_parent_text)<50:
-                    _suit = True
-                if _suit:
-                    logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
-                    if not extract_parameters_by_tree(_product,products,list_data,_data_i,parent_title,list_result):
-                        logger.debug("extract_type sentence %s"%("extract_parameters_by_tree"))
-                        extract_parameters_by_tree(_product,products,list_data,_data_i,parent_parent_title,list_result)
-
-
-            if re.search(_param_pattern,_text) is not None and len(_text)<50:
-                childs = _data["child_title"]
-                if len(childs)>0:
-                    logger.debug("extract_type sentence %s"%("re.search(_param_pattern,_text) is not None and len(_text)<50:"))
-                    extract_text,_,_ = get_childs_text([_data],_product,products)
-                    if len(extract_text)>0:
-                        list_result.append(extract_text)
-                    elif is_project:
-                        logger.debug("extract_type sentence is_project")
-                        extract_text,_,_ = get_childs_text([_data],_product,products,is_begin=True)
-                        if len(extract_text)>0 and re.search(meter_pattern,extract_text) is not None:
-                            list_result.append(extract_text)
-
+            extract_parameters_by_sentence(list_data,_data,_data_i,_product,products,list_result,is_project)
 
         elif _type=="table":
             if _text.find(_product)>=0:
                 _find_count += 1
             extract_parameters_by_table(_product,products,_param_pattern,list_data,_data_i,list_result)
 
-
-    return getBestProductText(list_result,_product,products),_find_count
+    _text = getBestProductText(list_result,_product,products)
+    return _text,_find_count
 
 
 if __name__ == '__main__':
 
-    filepath = "download/8679fef3a6fff56abcbdaccb1a190c80.html"
-    _product = "移液器"
+    filepath = "download/4597dcc128bfabc7584d10590ae50656.html"
+    _product = "彩色多普勒超声诊断仪"
 
     _html = open(filepath, "r", encoding="utf8").read()
 
@@ -965,5 +1130,6 @@ if __name__ == '__main__':
 
     _text,_count = extract_product_parameters(list_data,_product)
     logger.info("find count:%d"%(_count))
-    logger.info("extract_text %s"%_text)
+    logger.info("extract_parameter_text::%s"%(_text))
+
 

+ 1 - 2
BaseDataMaintenance/maintenance/product/productUtils.py

@@ -203,7 +203,6 @@ def jaccard_score(source,target):
     return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
 
 
-from fuzzywuzzy import fuzz
 def is_similar(source,target,_radio=None):
     source = str(source).lower()
     target = str(target).lower()
@@ -227,7 +226,7 @@ def is_similar(source,target,_radio=None):
     if min_len<2:
         return False
      #check the similarity ratio
-    similar = fuzz.ratio(source,target)
+    similar = Levenshtein.ratio(source,target)*100
     if similar>=min_ratio:
         log("%s and %s similar_jaro %d"%(source,target,similar))
         return True
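fuzzywuzzy's fuzz.ratio is dropped in favour of python-Levenshtein's ratio, scaled by 100 so the existing min_ratio thresholds keep their meaning (the Levenshtein module is assumed to be imported elsewhere in productUtils.py). A quick sketch of the equivalence:

    import Levenshtein

    a, b = "彩色多普勒超声诊断仪", "彩色多普勒超声仪"
    similar = Levenshtein.ratio(a, b) * 100   #0-100 scale, comparable to fuzz.ratio
    print(round(similar))                     #roughly 89 for this pair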

+ 0 - 247
BaseDataMaintenance/maintenance/product/product_attachment.py

@@ -1,247 +0,0 @@
-
-
-
-from apscheduler.schedulers.blocking import BlockingScheduler
-from tablestore import *
-from BaseDataMaintenance.dataSource.source import getConnect_ots,getAuth,is_internal
-from BaseDataMaintenance.dataSource.interface import *
-from multiprocessing import Queue as PQueue,Process
-from BaseDataMaintenance.model.ots.document_product import *
-from BaseDataMaintenance.model.ots.attachment import *
-from BaseDataMaintenance.common.Utils import *
-from BaseDataMaintenance.common.ossUtils import *
-from BaseDataMaintenance.maintenance.product.htmlparser import *
-import oss2
-from BaseDataMaintenance.common.multiThread import MultiThreadHandler
-
-parameter_status_no_bidfile = -1
-parameter_status_to_process = 0
-parameter_status_process_succeed = 1
-parameter_status_process_failed = 2
-parameter_status_process_jump = 3
-parameter_status_not_found = 4
-
-class Product_Attachment_Processor():
-
-    def __init__(self,):
-        self.ots_client = getConnect_ots()
-        self.product_attachment_queue = PQueue()
-        self.product_attachment_queue_size = 100
-        self.set_product_attachment = set()
-        self.attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
-        self.auth = getAuth()
-        oss2.defaults.connection_pool_size = 100
-        oss2.defaults.multiget_num_threads = 20
-        if is_internal:
-            self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
-        else:
-            self.bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
-        log("bucket_url:%s"%(self.bucket_url))
-        self.attachment_bucket_name = "attachment-hub"
-        self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
-        self.current_path = os.path.dirname(__file__)
-        self.download_path = "%s/%s"%(self.current_path,"download")
-
-    def process_parameters_producer(self,):
-
-        if self.product_attachment_queue.qsize()>self.product_attachment_queue_size/3:
-            return
-        bool_query = BoolQuery(must_queries=[
-            TermQuery("parameter_status",parameter_status_to_process)
-        ])
-        list_id = []
-        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
-                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
-                                                                            ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
-
-        list_data = getRow_ots(rows)
-        for data in list_data:
-            _id = data.get(DOCUMENT_PRODUCT_ID)
-            if _id in self.set_product_attachment:
-                continue
-            self.product_attachment_queue.put(data)
-            list_id.append(_id)
-        while next_token:
-            if self.product_attachment_queue.qsize()>=self.product_attachment_queue_size:
-                break
-            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
-                                                                                SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                                ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
-            list_data = getRow_ots(rows)
-            for data in list_data:
-                _id = data.get(DOCUMENT_PRODUCT_ID)
-                if _id in self.set_product_attachment:
-                    continue
-                self.product_attachment_queue.put(data)
-                list_id.append(_id)
-        self.set_product_attachment =  set(list_id)
-
-    def process_parameters_handler(self,item,result_queue):
-        bid_filemd5s = item.get(DOCUMENT_PRODUCT_BID_FILEMD5S)
-        product_name = item.get(DOCUMENT_PRODUCT_NAME)
-        product_original_name = item.get(DOCUMENT_PRODUCT_ORIGINAL_NAME)
-        list_product = []
-        if product_name is not None:
-            list_product.append(product_name)
-        if product_original_name is not None:
-            list_product.extend(product_original_name.split("_"))
-        list_product = list(set(list_product))
-        dp = Document_product(item)
-        if bid_filemd5s is None or bid_filemd5s=="" or len(list_product)==0:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
-            dp.update_row(self.ots_client)
-            return
-        list_filemd5 = bid_filemd5s.split(",")
-        _find = False
-        _success = False
-        for _filemd5 in list_filemd5:
-            if _find:
-                break
-            atta = attachment({attachment_filemd5:_filemd5})
-            if atta.fix_columns(self.ots_client,[attachment_path,attachment_filetype],True):
-                objectPath = atta.getProperties().get(attachment_path)
-                _filetype = atta.getProperties().get(attachment_filetype)
-                if _filetype in ("doc","xls"):
-                    if len(list_filemd5)==1:
-                        dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_jump,True)
-                        dp.update_row(self.ots_client)
-                        return
-                    else:
-                        continue
-                localpath = "%s/%s.%s"%(self.download_path,_filemd5,_filetype)
-                localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
-                download_succeed = False
-                try:
-                    if not os.path.exists(localpath):
-                        download_succeed = downloadFile(self.bucket,objectPath,localpath)
-                    else:
-                        download_succeed = True
-                except Exception as e:
-                    download_succeed = False
-                if download_succeed:
-                    try:
-                        _html = ""
-                        if os.path.exists(localhtml):
-                            _html = open(localhtml,"r",encoding="utf8").read()
-                            _success = True
-                        if len(_html)>10:
-                            _success = True
-                        else:
-                            _data_base64 = base64.b64encode(open(localpath,"rb").read())
-                            _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,url="http://192.168.2.102:15011/convert",kwargs={'page_no': '1,-1',"max_bytes":"-1"},timeout=6000)
-                            if _success:
-                                localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
-                                with open(localhtml,"w",encoding="utf8") as f:
-                                    f.write(_html)
-                        if _success:
-                            if len(_html)>5:
-                                pd = ParseDocument(_html,True)
-
-                                list_text = []
-                                for _product in list_product:
-                                    pd.fix_tree(_product)
-                                    list_data = pd.tree
-                                    _text,_count = extract_product_parameters(list_data,_product)
-                                    if _count>0:
-                                        _find = True
-                                    if _text is not None:
-                                        list_text.append(_text)
-                                pd = ParseDocument(_html,False)
-
-                                list_text = []
-                                for _product in list_product:
-                                    pd.fix_tree(_product)
-                                    list_data = pd.tree
-                                    _text,_count = extract_product_parameters(list_data,_product)
-                                    if _count>0:
-                                        _find = True
-                                    if _text is not None:
-                                        list_text.append(_text)
-                                if len(list_text)>0:
-                                    list_text.sort(key=lambda x:len(re.findall('[::;;]',BeautifulSoup(x,"html5lib").get_text())), reverse=True)
-                                    _text = list_text[0]
-                                    _success = True
-                                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER,_text,True)
-                                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_succeed,True)
-                                    dp.update_row(self.ots_client)
-                                    return
-                            else:
-                                log("product attachment process filemd5 %s has no content"%(_filemd5))
-                    except Exception as e:
-                        traceback.print_exc()
-                    finally:
-                        try:
-                            # if os.path.exists(localpath):
-                            #     os.remove(localpath)
-                            pass
-                        except Exception as e:
-                            pass
-
-        if not _find:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_not_found,True)
-            dp.update_row(self.ots_client)
-        else:
-            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_failed,True)
-            dp.update_row(self.ots_client)
-
-    def start_process(self):
-        mt = MultiThreadHandler(self.product_attachment_queue,self.process_parameters_handler,None,3,need_stop=False,restart=True)
-        mt.run()
-
-    def process_parameters_comsumer(self,):
-
-        # process_count = 2
-        # list_process = []
-        # for i in range(process_count):
-        #     p = Process(target=self.start_process)
-        #     list_process.append(p)
-        # for p in list_process:
-        #     p.start()
-        # for p in list_process:
-        #     p.join()
-
-        self.start_process()
-
-    def start_process_parameters(self):
-        scheduler = BlockingScheduler()
-        scheduler.add_job(self.process_parameters_producer,"cron",second="*/10")
-        scheduler.add_job(self.process_parameters_comsumer,"cron",second="*/30")
-        scheduler.start()
-
-def start_process_parameters():
-    pap = Product_Attachment_Processor()
-    pap.start_process_parameters()
-
-def change_parameters_status():
-    ots_client =getConnect_ots()
-    bool_query = BoolQuery(must_queries=[
-        RangeQuery("parameter_status",-1)
-    ],
-                           must_not_queries=[
-        TermQuery("parameter_status",parameter_status_to_process),
-        TermQuery("parameter_status",parameter_status_process_succeed),
-        TermQuery("parameter_status",parameter_status_process_jump),
-        # TermQuery("parameter_status",parameter_status_no_bidfile),
-
-    ])
-    list_data = []
-    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
-                                                                        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
-                                                                        ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME],return_type=ColumnReturnType.SPECIFIED))
-
-    list_data.extend(getRow_ots(rows))
-    print("total_count",total_count)
-    while next_token:
-        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
-                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                            ColumnsToGet([DOCUMENT_PRODUCT_BID_FILEMD5S],return_type=ColumnReturnType.SPECIFIED))
-        list_data.extend(getRow_ots(rows))
-    for data in list_data:
-        dp = Document_product(data)
-        dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_to_process,True)
-        dp.setValue(DOCUMENT_PRODUCT_PARAMETER,"",True)
-        dp.update_row(ots_client)
-
-if __name__ == '__main__':
-    start_process_parameters()
-    # change_parameters_status()

+ 322 - 0
BaseDataMaintenance/maintenance/product/product_parameter.py

@@ -0,0 +1,322 @@
+
+
+
+from apscheduler.schedulers.blocking import BlockingScheduler
+from tablestore import *
+from BaseDataMaintenance.dataSource.source import getConnect_ots,getAuth,is_internal
+from BaseDataMaintenance.dataSource.interface import *
+from multiprocessing import Queue as PQueue
+from multiprocessing import Process
+from BaseDataMaintenance.model.ots.document_product import *
+from BaseDataMaintenance.model.ots.attachment import *
+from BaseDataMaintenance.common.Utils import *
+from BaseDataMaintenance.common.ossUtils import *
+from BaseDataMaintenance.maintenance.product.htmlparser import *
+from BaseDataMaintenance.maintenance.product.productUtils import pool_product
+import oss2
+from BaseDataMaintenance.common.multiThread import MultiThreadHandler
+
+parameter_status_no_bidfile = -1
+parameter_status_to_process = 0
+parameter_status_process_succeed = 1
+parameter_status_process_failed = 2
+parameter_status_process_jump = 3
+parameter_status_not_found = 4
+
+import redis
+
+from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
+
+class Product_Attachment_Processor():
+
+    def __init__(self,):
+        self.ots_client = getConnect_ots()
+        self.product_attachment_queue = PQueue()
+        self.product_attachment_queue_size = 50
+        self.set_product_attachment = set()
+        self.attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
+        self.auth = getAuth()
+        oss2.defaults.connection_pool_size = 100
+        oss2.defaults.multiget_num_threads = 20
+        if is_internal:
+            self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
+        else:
+            self.bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
+        log("bucket_url:%s"%(self.bucket_url))
+        self.attachment_bucket_name = "attachment-hub"
+        self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
+        self.current_path = os.path.dirname(__file__)
+        self.download_path = "%s/%s"%(self.current_path,"download")
+        self.test_url="http://192.168.2.102:15011/convert"
+
+    def process_parameters_producer(self,):
+        attachment_size = getQueueSize("dataflow_attachment")
+        if attachment_size<100:
+
+            _qsize = self.product_attachment_queue.qsize()
+            log("product_attachment_queue %d"%(_qsize))
+            if _qsize>self.product_attachment_queue_size/3:
+                return
+            bool_query = BoolQuery(must_queries=[
+                TermQuery("parameter_status",parameter_status_to_process)
+            ])
+            list_id = []
+            dict_docid_list = {}
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
+                                                                                ColumnsToGet([DOCUMENT_PRODUCT_ATTACHMENTS,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME,DOCUMENT_PRODUCT_DOCID],return_type=ColumnReturnType.SPECIFIED))
+
+            list_data = getRow_ots(rows)
+            _count = 0
+            for data in list_data:
+                _id = data.get(DOCUMENT_PRODUCT_ID)
+                list_id.append(_id)
+                if _id in self.set_product_attachment:
+                    continue
+                docid = data.get(DOCUMENT_PRODUCT_DOCID)
+                if docid not in dict_docid_list:
+                    dict_docid_list[docid] = []
+                dict_docid_list[docid].append(data)
+
+                _count += 1
+            while next_token:
+                if len(dict_docid_list.keys())>=self.product_attachment_queue_size:
+                    break
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                    ColumnsToGet([DOCUMENT_PRODUCT_ATTACHMENTS,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME,DOCUMENT_PRODUCT_DOCID],return_type=ColumnReturnType.SPECIFIED))
+                list_data = getRow_ots(rows)
+                for data in list_data:
+                    _id = data.get(DOCUMENT_PRODUCT_ID)
+                    list_id.append(_id)
+                    if _id in self.set_product_attachment:
+                        continue
+                    docid = data.get(DOCUMENT_PRODUCT_DOCID)
+                    if docid not in dict_docid_list:
+                        dict_docid_list[docid] = []
+                    dict_docid_list[docid].append(data)
+
+                    _count += 1
+            for k,v in dict_docid_list.items():
+                self.product_attachment_queue.put(v)
+            _qsize = self.product_attachment_queue.qsize()
+            log("after product_attachment_queue %d"%(_qsize))
+            self.set_product_attachment = set(list_id)
+
+    def get_whole_html(self,_filemd5):
+        atta = attachment({attachment_filemd5:_filemd5})
+        _html = ""
+
+        db = redis.Redis(connection_pool=pool_product)
+        _key = "filemd5:%s"%(_filemd5)
+
+        _cache_html = None
+        try:
+            _cache_html = db.get(_key)
+        except Exception as e:
+            logger.info("failed to read cached html from redis")
+
+        if _cache_html is not None:
+            # redis may return bytes unless the pool sets decode_responses; decode defensively
+            _html = _cache_html.decode("utf8") if isinstance(_cache_html,bytes) else _cache_html
+        else:
+            if atta.fix_columns(self.ots_client,[attachment_path,attachment_filetype,attachment_size],True):
+                objectPath = atta.getProperties().get(attachment_path)
+                _filetype = atta.getProperties().get(attachment_filetype)
+                _size = atta.getProperties().get(attachment_size,0)
+                if _size<=0 or _size>=20*1024*1024:
+                    return _html
+
+                # not supported on windows
+                # if _filetype in ("doc","xls"):
+                #     if len(list_filemd5)==1:
+                #         dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_jump,True)
+                #         dp.update_row(self.ots_client)
+                #         return
+                #     else:
+                #         continue
+
+                localpath = "%s/%s.%s"%(self.download_path,_filemd5,_filetype)
+                localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
+                download_succeed = False
+                try:
+                    if not os.path.exists(localpath):
+                        download_succeed = downloadFile(self.bucket,objectPath,localpath)
+                    else:
+                        download_succeed = True
+                except Exception as e:
+                    download_succeed = False
+                if download_succeed:
+                    try:
+                        start_time = time.time()
+                        if os.path.exists(localhtml):
+                            _html = open(localhtml,"r",encoding="utf8").read()
+                            _success = True
+                        if len(_html)>10:
+                            _success = True
+                        else:
+                            _data_base64 = base64.b64encode(open(localpath,"rb").read())
+
+                            _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,kwargs={'page_no': '1,-1',"max_bytes":"-1","timeout":6000},timeout=6000)
+
+                            if _success:
+                                db.set(_key,_html,24*60*60)
+                                # save for debug
+                                # localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
+                                # with open(localhtml,"w",encoding="utf8") as f:
+                                #     f.write(_html)
+
+                    except ConnectionError as e1:
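+                        # assumption: the elapsed threshold is in seconds; a conversion that
+                        # ran long before dropping the connection is cached as empty so the
+                        # same filemd5 is not re-converted for the next 24h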
+                        if time.time()-start_time>5000:
+                            db.set(_key,_html,24*60*60)
+                        else:
+                            raise e1
+                    except Exception as e:
+                        traceback.print_exc()
+                    finally:
+                        try:
+                            if os.path.exists(localpath):
+                                os.remove(localpath)
+                            pass
+                        except Exception as e:
+                            pass
+            else:
+                log("attachment %s not exists"%_filemd5)
+        return _html
+
+    def process_parameters_handler(self,list_item,result_queue):
+        for item in list_item:
+            attachments = item.get(DOCUMENT_PRODUCT_ATTACHMENTS)
+            product_name = item.get(DOCUMENT_PRODUCT_NAME)
+            product_original_name = item.get(DOCUMENT_PRODUCT_ORIGINAL_NAME)
+            list_product = []
+            log("processing name:%s original_name:%s attachments:%s"%(product_name,product_original_name,attachments))
+            if product_original_name is not None:
+                _l = product_original_name.split("_")
+                _l.reverse()
+                list_product.extend(_l)
+            if product_name is not None:
+                list_product.append(product_name)
+            list_product = list(set(list_product))
+            dp = Document_product(item)
+            if attachments is None or attachments=="" or len(list_product)==0:
+                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
+                dp.update_row(self.ots_client)
+                # continue rather than return: other products in this docid group may still have attachments
+                continue
+            list_attachment = json.loads(attachments)
+            list_attachment.sort(key=lambda x:0 if x.get("classification")=="招标文件" else 1 if x.get("classification")=="采购清单" else 2)
+            list_filemd5 = [a.get("filemd5","") for a in list_attachment]
+            _find = False
+            _success = False
+            list_text = []
+            for _filemd5 in list_filemd5:
+                _html = self.get_whole_html(_filemd5)
+                if len(_html)>5:
+
+                    pd = ParseDocument(_html,True)
+                    for _product in list_product:
+                        pd.fix_tree(_product)
+                        list_data = pd.tree
+                        _text,_count = extract_product_parameters(list_data,_product)
+                        if _count>0:
+                            _find = True
+                        if _text is not None:
+                            list_text.append(_text)
+
+                    pd = ParseDocument(_html,False)
+                    for _product in list_product:
+                        pd.fix_tree(_product)
+                        list_data = pd.tree
+                        _text,_count = extract_product_parameters(list_data,_product)
+                        if _count>0:
+                            _find = True
+                        if _text is not None:
+                            list_text.append(_text)
+                else:
+                    log("product attachment process filemd5 %s has no content"%(_filemd5))
+                if len(list_text)>0:
+                    _text = getBestProductText(list_text,'',[])
+                    logger.info("extract_parameter_text bid_filemd5s:%s name:%s original_name:%s parameter_text:%s"%(str(list_filemd5),product_name,product_original_name,_text))
+                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER,_text,True)
+                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_succeed,True)
+                    dp.update_row(self.ots_client)
+                    _success = True
+                    break
+
+            if not _success:
+                if not _find:
+                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_not_found,True)
+                    dp.update_row(self.ots_client)
+                else:
+                    dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_failed,True)
+                    dp.update_row(self.ots_client)
+
+    def start_process(self):
+        self.process_parameters_producer()
+        thread_count = 7
+        mt = MultiThreadHandler(self.product_attachment_queue,self.process_parameters_handler,None,thread_count,need_stop=False,restart=True)
+        mt.run()
+
+    def process_parameters_consumer(self,):
+        # process_count = 3
+        # list_process = []
+        # for i in range(process_count):
+        #     p = Process(target=self.start_process)
+        #     list_process.append(p)
+        # for p in list_process:
+        #     p.start()
+        # for p in list_process:
+        #     p.join()
+        self.start_process()
+
+
+    def start_process_parameters(self):
+        scheduler = BlockingScheduler()
+        scheduler.add_job(self.process_parameters_producer,"cron",second="*/20")
+        scheduler.add_job(self.process_parameters_consumer,"cron",second="*/30")
+        scheduler.start()
+
+def start_process_parameters():
+    pap = Product_Attachment_Processor()
+    pap.start_process_parameters()
+
+def change_parameters_status():
+    ots_client = getConnect_ots()
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery("parameter_status",-1)
+    ],
+                           must_not_queries=[
+        TermQuery("parameter_status",parameter_status_to_process),
+        TermQuery("parameter_status",parameter_status_process_succeed),
+        TermQuery("parameter_status",parameter_status_process_jump),
+        TermQuery("parameter_status",parameter_status_no_bidfile),
+        TermQuery("parameter_status",parameter_status_not_found),
+
+    ])
+    list_data = []
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
+                                                                        ColumnsToGet([DOCUMENT_PRODUCT_ID],return_type=ColumnReturnType.SPECIFIED))
+
+    list_data.extend(getRow_ots(rows))
+    print("total_count",total_count)
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                            ColumnsToGet([DOCUMENT_PRODUCT_ID],return_type=ColumnReturnType.SPECIFIED))
+        list_data.extend(getRow_ots(rows))
+    from queue import Queue
+    task_queue = Queue()
+    for data in list_data:
+        task_queue.put(data)
+
+    def _handle(data,result_queue):
+        dp = Document_product(data)
+        dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_to_process,True)
+        dp.setValue(DOCUMENT_PRODUCT_PARAMETER,"",True)
+        dp.update_row(ots_client)
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
+
+if __name__ == '__main__':
+    start_process_parameters()
+    # change_parameters_status()
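get_whole_html above caches converted html in redis under "filemd5:<md5>" with a 24-hour TTL, so repeated products pointing at the same attachment skip the convert service; attachments are then tried in classification order (招标文件 first, 采购清单 second), breaking on the first file that yields parameter text. A minimal sketch of that cache-aside pattern (pool_product is the redis connection pool imported from productUtils; key scheme and TTL mirror the code above, cached_convert itself is hypothetical):

    import redis
    from BaseDataMaintenance.maintenance.product.productUtils import pool_product

    def cached_convert(filemd5, convert):
        """Return converted html for filemd5, converting at most once per 24h."""
        db = redis.Redis(connection_pool=pool_product)
        key = "filemd5:%s" % filemd5
        cached = db.get(key)
        if cached is not None:
            return cached.decode("utf8") if isinstance(cached, bytes) else cached
        html = convert(filemd5)           # the expensive call to the convert service
        if html:
            db.set(key, html, 24*60*60)   # expire after one day, as in get_whole_html
        return html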

The diff for this file is too large to display
+ 1 - 1
BaseDataMaintenance/maxcompute/documentDumplicate.py


+ 11 - 11
BaseDataMaintenance/model/ots/document.py

@@ -321,11 +321,15 @@ def turn_document_status():
         #     # must_not_queries=[WildcardQuery("DX004354*")]
         # )
         bool_query = BoolQuery(
-            must_queries=[
-                RangeQuery("crtime","2023-08-30 15:00:00","2023-08-30 23:59:59"),
-                NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
-            ],
-            must_not_queries=[WildcardQuery("attachmenttextcon","*")]
+            # must_queries=[
+            #     RangeQuery("crtime","2023-08-30 15:00:00","2023-08-30 23:59:59"),
+            #     NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
+            # ],
+            # must_not_queries=[WildcardQuery("attachmenttextcon","*")],
+            should_queries=[
+                NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","个体工商户")),
+                NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","机械设备")),
+            ]
 
         )
 
@@ -337,9 +341,7 @@ def turn_document_status():
         _count = len(list_data)
         for _data in list_data:
             _document = Document(_data)
-            _attachment = _data.get(document_attachmenttextcon,"")
-            if _attachment=="":
-                task_queue.put(_document)
+            task_queue.put(_document)
         while next_token:
             rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
@@ -349,9 +351,7 @@ def turn_document_status():
             print("%d/%d"%(_count,total_count))
             for _data in list_data:
                 _document = Document(_data)
-                _attachment = _data.get(document_attachmenttextcon,"")
-                if _attachment=="":
-                    task_queue.put(_document)
+                task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:
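Note on the rewritten query: with only should_queries, Tablestore's BoolQuery matches rows satisfying at least one clause, so this selects documents whose sub_docs_json.win_tenderer equals either term. Making the floor explicit (a sketch; minimum_should_match is a parameter of the SDK's BoolQuery, and the default of 1 when only should clauses are present is an assumption worth verifying):

    bool_query = BoolQuery(
        should_queries=[
            NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","个体工商户")),
            NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","机械设备")),
        ],
        minimum_should_match=1,
    )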

The diff for this file is too large to display
+ 27 - 0
BaseDataMaintenance/model/ots/document_html.py


+ 4 - 0
BaseDataMaintenance/start_product.py

@@ -11,6 +11,7 @@ def main(args=None):
     parser.add_argument("--search_similar",dest="search_similar",action="store_true",help="start product_dict_synchonize process")
     parser.add_argument("--start_process_product",dest="start_process_product",action="store_true",help="start product_dict_synchonize process")
     parser.add_argument("--test",dest="test",action="store_true",help="start product_dict_synchonize process")
+    parser.add_argument("--start_extract_parameter",dest="start_extract_parameter",action="store_true",help="start extract_parameter")
 
     args = parser.parse_args(args)
     if args.product_dict_synchonize:
@@ -28,6 +29,9 @@ def main(args=None):
     if args.test:
         from BaseDataMaintenance.maintenance.product.products import test
         test()
+    if args.start_extract_parameter:
+        from BaseDataMaintenance.maintenance.product.product_parameter import start_process_parameters
+        start_process_parameters()
 
 
 if __name__ == '__main__':
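Since main(args=None) forwards straight to argparse, the new flag can be exercised programmatically as well as from the shell (a usage sketch; module path as in the diff):

    from BaseDataMaintenance.start_product import main

    # equivalent to: python start_product.py --start_extract_parameter
    main(["--start_extract_parameter"])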

Some files were not shown because too many files changed in this diff