|
@@ -24,12 +24,14 @@ parameter_status_not_found = 4
|
|
|
|
|
|
import redis
|
|
import redis
|
|
|
|
|
|
|
|
+from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
|
|
|
|
+
|
|
class Product_Attachment_Processor():
|
|
class Product_Attachment_Processor():
|
|
|
|
|
|
def __init__(self,):
|
|
def __init__(self,):
|
|
self.ots_client = getConnect_ots()
|
|
self.ots_client = getConnect_ots()
|
|
self.product_attachment_queue = PQueue()
|
|
self.product_attachment_queue = PQueue()
|
|
- self.product_attachment_queue_size = 100
|
|
|
|
|
|
+ self.product_attachment_queue_size = 50
|
|
self.set_product_attachment = set()
|
|
self.set_product_attachment = set()
|
|
self.attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
|
|
self.attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
|
|
self.auth = getAuth()
|
|
self.auth = getAuth()
|
|
@@ -47,38 +49,53 @@ class Product_Attachment_Processor():
|
|
self.test_url="http://192.168.2.102:15011/convert"
|
|
self.test_url="http://192.168.2.102:15011/convert"
|
|
|
|
|
|
def process_parameters_producer(self,):
    """Queue product rows awaiting parameter extraction, grouped by docid.

    Does nothing when the "dataflow_attachment" MQ queue reports 100 or more
    messages, or when the local queue already holds more than a third of
    ``product_attachment_queue_size`` entries. Otherwise it pages through the
    search index for rows whose ``parameter_status`` equals
    ``parameter_status_to_process``, groups them by docid and puts one list of
    rows per docid on ``product_attachment_queue``.

    Rows whose id is in ``set_product_attachment`` (the ids accepted by the
    previous run) are skipped; the set is then replaced with the ids accepted
    by this run.
    """
    # NOTE(review): assumes getQueueSize returns a number; a None result would
    # raise TypeError on the comparison below - confirm against MQInfo.
    attachment_size = getQueueSize("dataflow_attachment")
    if attachment_size>=100:
        return
    if self.product_attachment_queue.qsize()>self.product_attachment_queue_size/3:
        return

    bool_query = BoolQuery(must_queries=[
        TermQuery("parameter_status",parameter_status_to_process)
    ])

    def _search(search_query):
        # One page of the index; only rows and the continuation token are used.
        rows,token,_total_count,_is_all_succeed = self.ots_client.search(
            Document_product_table_name,Document_product_table_name+"_index",
            search_query,
            ColumnsToGet([DOCUMENT_PRODUCT_ATTACHMENTS,DOCUMENT_PRODUCT_NAME,DOCUMENT_PRODUCT_ORIGINAL_NAME,DOCUMENT_PRODUCT_DOCID],return_type=ColumnReturnType.SPECIFIED))
        return rows,token

    list_id = []
    dict_docid_list = {}

    # The sort is only given on the first request; follow-up pages pass the token alone.
    rows,next_token = _search(
        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True))
    while True:
        for data in getRow_ots(rows):
            # presumably the id is returned as part of the row's primary key - it is not in ColumnsToGet
            _id = data.get(DOCUMENT_PRODUCT_ID)
            if _id in self.set_product_attachment:
                continue
            dict_docid_list.setdefault(data.get(DOCUMENT_PRODUCT_DOCID),[]).append(data)
            list_id.append(_id)
        # Stop when the index is exhausted or enough distinct docids were collected.
        if not next_token or len(dict_docid_list)>=self.product_attachment_queue_size:
            break
        rows,next_token = _search(
            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True))

    for list_row in dict_docid_list.values():
        self.product_attachment_queue.put(list_row)
    self.set_product_attachment = set(list_id)
|
|
|
|
|
|
def get_whole_html(self,_filemd5):
|
|
def get_whole_html(self,_filemd5):
|
|
atta = attachment({attachment_filemd5:_filemd5})
|
|
atta = attachment({attachment_filemd5:_filemd5})
|
|
@@ -96,9 +113,12 @@ class Product_Attachment_Processor():
|
|
if _cache_html is not None:
|
|
if _cache_html is not None:
|
|
_html = _cache_html
|
|
_html = _cache_html
|
|
else:
|
|
else:
|
|
- if atta.fix_columns(self.ots_client,[attachment_path,attachment_filetype],True):
|
|
|
|
|
|
+ if atta.fix_columns(self.ots_client,[attachment_path,attachment_filetype,attachment_size],True):
|
|
objectPath = atta.getProperties().get(attachment_path)
|
|
objectPath = atta.getProperties().get(attachment_path)
|
|
_filetype = atta.getProperties().get(attachment_filetype)
|
|
_filetype = atta.getProperties().get(attachment_filetype)
|
|
|
|
+ _size = atta.getProperties().get(attachment_size,0)
|
|
|
|
+ if _size<=0 or _size>=20*1024*1024:
|
|
|
|
+ return _html
|
|
|
|
|
|
# not supported on windows
|
|
# not supported on windows
|
|
# if _filetype in ("doc","xls"):
|
|
# if _filetype in ("doc","xls"):
|
|
@@ -121,7 +141,7 @@ class Product_Attachment_Processor():
|
|
download_succeed = False
|
|
download_succeed = False
|
|
if download_succeed:
|
|
if download_succeed:
|
|
try:
|
|
try:
|
|
-
|
|
|
|
|
|
+ start_time = time.time()
|
|
if os.path.exists(localhtml):
|
|
if os.path.exists(localhtml):
|
|
_html = open(localhtml,"r",encoding="utf8").read()
|
|
_html = open(localhtml,"r",encoding="utf8").read()
|
|
_success = True
|
|
_success = True
|
|
@@ -129,7 +149,8 @@ class Product_Attachment_Processor():
|
|
_success = True
|
|
_success = True
|
|
else:
|
|
else:
|
|
_data_base64 = base64.b64encode(open(localpath,"rb").read())
|
|
_data_base64 = base64.b64encode(open(localpath,"rb").read())
|
|
- _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,kwargs={'page_no': '1,-1',"max_bytes":"-1"},timeout=6000)
|
|
|
|
|
|
+
|
|
|
|
+ _success,_html,swf_images,classification = getAttachDealInterface(_data_base64,_filetype,kwargs={'page_no': '1,-1',"max_bytes":"-1","timeout":6000},timeout=6000)
|
|
|
|
|
|
if _success:
|
|
if _success:
|
|
db.set(_key,_html,24*60*60)
|
|
db.set(_key,_html,24*60*60)
|
|
@@ -138,6 +159,11 @@ class Product_Attachment_Processor():
|
|
# with open(localhtml,"w",encoding="utf8") as f:
|
|
# with open(localhtml,"w",encoding="utf8") as f:
|
|
# f.write(_html)
|
|
# f.write(_html)
|
|
|
|
|
|
|
|
+ except ConnectionError as e1:
|
|
|
|
+ if time.time()-start_time>5000:
|
|
|
|
+ db.set(_key,_html,24*60*60)
|
|
|
|
+ else:
|
|
|
|
+ raise e1
|
|
except Exception as e:
|
|
except Exception as e:
|
|
traceback.print_exc()
|
|
traceback.print_exc()
|
|
finally:
|
|
finally:
|
|
@@ -147,89 +173,99 @@ class Product_Attachment_Processor():
|
|
pass
|
|
pass
|
|
except Exception as e:
|
|
except Exception as e:
|
|
pass
|
|
pass
|
|
|
|
+ else:
|
|
|
|
+ log("attachment %s not exists"%_filemd5)
|
|
return _html
|
|
return _html
|
|
|
|
|
|
def process_parameters_handler(self,list_item,result_queue):
    """Extract parameter text for every product row in a batch and store the result.

    For each row, the candidate product names are built from the original
    name (split on "_") plus the product name. The row's attachments are then
    tried in priority order: each is fetched as HTML through
    ``get_whole_html`` and searched with both ``ParseDocument`` variants. As
    soon as any text has been collected, the best text is saved with the
    "succeed" status and the remaining attachments are not read. Otherwise
    the row is marked "not found" (no match at all) or "failed" (a match was
    counted but no text was produced).

    Args:
        list_item: iterable of dict-like product rows (read with ``.get``).
        result_queue: not used by this handler; presumably required by the
            worker callback signature.
    """
    for item in list_item:
        attachments = item.get(DOCUMENT_PRODUCT_ATTACHMENTS)
        product_name = item.get(DOCUMENT_PRODUCT_NAME)
        product_original_name = item.get(DOCUMENT_PRODUCT_ORIGINAL_NAME)
        log("processing name:%s original_name:%s attachments:%s"%(product_name,product_original_name,attachments))

        list_product = []
        if product_original_name is not None:
            list_product.extend(reversed(product_original_name.split("_")))
        if product_name is not None:
            list_product.append(product_name)
        list_product = list(set(list_product))

        dp = Document_product(item)
        if attachments is None or attachments=="" or len(list_product)==0:
            # NOTE(review): unlike the other status writes, no third argument is
            # passed to setValue here - confirm this is intended.
            dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
            dp.update_row(self.ots_client)
            # Skip only this row; the rest of the batch must still be processed.
            continue

        list_attachment = json.loads(attachments)
        # Priority: "招标文件" (tender document) first, then "采购清单"
        # (procurement list), then everything else.
        list_attachment.sort(key=lambda x:0 if x.get("classification")=="招标文件" else 1 if x.get("classification")=="采购清单" else 2)
        list_filemd5 = [a.get("filemd5","") for a in list_attachment]

        _find = False
        _success = False
        list_text = []
        for _filemd5 in list_filemd5:
            _html = self.get_whole_html(_filemd5)
            if len(_html)>5:
                # Same extraction with both values of ParseDocument's second argument.
                for _flag in (True,False):
                    pd = ParseDocument(_html,_flag)
                    for _product in list_product:
                        pd.fix_tree(_product)
                        list_data = pd.tree
                        _text,_count = extract_product_parameters(list_data,_product)
                        if _count>0:
                            _find = True
                        if _text is not None:
                            list_text.append(_text)
            else:
                log("product attachment process filemd5 %s has no content"%(_filemd5))

            if len(list_text)>0:
                _best_text = getBestProductText(list_text,'',[])
                logger.info("extract_parameter_text bid_filemd5s:%s name:%s original_name:%s parameter_text:%s"%(str(list_filemd5),product_name,product_original_name,_best_text))
                dp.setValue(DOCUMENT_PRODUCT_PARAMETER,_best_text,True)
                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_succeed,True)
                dp.update_row(self.ots_client)
                _success = True
                # Lower-priority attachments are not consulted once text is found.
                break

        if not _success:
            if not _find:
                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_not_found,True)
            else:
                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_failed,True)
            dp.update_row(self.ots_client)
|
|
|
|
|
|
def start_process(self):
    """Fill the queue once, then run the multi-threaded handler over it.

    Calls ``process_parameters_producer`` and then runs a
    ``MultiThreadHandler`` with 7 threads that feeds queue entries to
    ``process_parameters_handler``.
    """
    self.process_parameters_producer()

    worker_threads = 7
    handler_pool = MultiThreadHandler(
        self.product_attachment_queue,
        self.process_parameters_handler,
        None,
        worker_threads,
        need_stop=False,
        restart=True,
    )
    handler_pool.run()
|
|
|
|
|
|
def process_parameters_comsumer(self,):
    """Run one consumer pass by calling ``start_process`` directly.

    The call is made in the caller's own thread; no child processes are
    started from here.
    """
    self.start_process()
|
|
|
|
|
|
|
|
|
|
def start_process_parameters(self):
    """Register the producer and consumer as cron jobs and start the scheduler.

    The producer uses the second expression "*/20" and the consumer "*/30".
    The scheduler is a ``BlockingScheduler``, started after both jobs are
    registered.
    """
    cron_jobs = (
        (self.process_parameters_producer,"*/20"),
        (self.process_parameters_comsumer,"*/30"),
    )
    scheduler = BlockingScheduler()
    for job,second_expr in cron_jobs:
        scheduler.add_job(job,"cron",second=second_expr)
    scheduler.start()
|
|
|
|
|