瀏覽代碼

增加原始名称,招标文件附件和修复历史数据

luojiehua 2 年之前
父節點
當前提交
67c4894e99

+ 2 - 2
BaseDataMaintenance/common/multiThread.py

@@ -45,7 +45,7 @@ class _taskHandler(threading.Thread):
     def run(self):
         while(True):
             try:
-                logging.info("handler task queue size is %d need_stop %s"%(self.task_queue.qsize(),str(self.need_stop)))
+                logging.debug("handler task queue size is %d need_stop %s"%(self.task_queue.qsize(),str(self.need_stop)))
                 item = self.task_queue.get(True,timeout=1)
                 self.task_handler(item,self.result_queue,*self.args,**self.kwargs)
                 # self.task_queue.task_done()
@@ -82,7 +82,7 @@ class MultiThreadHandler(object):
                     _t = _taskHandler(self.task_queue,self.task_handler,self.result_queue,self.need_stop,*self.args,**self.kwargs)
                     _t.start()
                     restart += 1
-        logging.info("thread status alive:%d restart:%d total:%d need_stop %s"%(_count,restart,len(self.list_thread),str(self.need_stop)))
+        logging.debug("thread status alive:%d restart:%d total:%d need_stop %s"%(_count,restart,len(self.list_thread),str(self.need_stop)))
         return _count,restart,len(self.list_thread)
 
     def run(self):

+ 7 - 2
BaseDataMaintenance/dataSource/source.py

@@ -82,6 +82,7 @@ import platform
 is_internal = False
 if platform.system()=="Windows":
     OTS_URL = "https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com"
+    OTS_URL = "https://bxkc-ots.cn-hangzhou.vpc.tablestore.aliyuncs.com"
 else:
     OTS_URL = "https://bxkc-ots.cn-hangzhou.vpc.tablestore.aliyuncs.com"
     check_url = "oss-cn-hangzhou-internal.aliyuncs.com"
@@ -160,6 +161,10 @@ if __name__=="__main__":
     # data = make_elasticSearch({"query":{"bool":{"must":[{"wildcard":{"nicknames.keyword":"*服装*"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}})
     # print(data)
     # getConnect_neo4j()
-    conn = getConnection_oracle()
+    # conn = getConnection_oracle()
     # cursor = conn.cursor()
-    # getConnect_gdb()
+    # getConnect_gdb()
+    import sys,os
+    import platform
+    print(platform.system())
+    print(sys.platform)

+ 52 - 26
BaseDataMaintenance/maintenance/product/productUtils.py

@@ -1,6 +1,8 @@
 
 from BaseDataMaintenance.maintenance.product.product_setting import *
 
+
+import Levenshtein
 import re
 # 判断是不是入参字符串为全中文
 def judge_pur_chinese(keyword):
@@ -27,12 +29,26 @@ def get_chinese_string(string):
             s += ch
     return s
 
+
+def jaccard_score(source,target):
+    source_set = set([s for s in source])
+    target_set = set([s for s in target])
+    if len(source_set)==0 or len(target_set)==0:
+        return 0
+    return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
+
+
 from fuzzywuzzy import fuzz
 def is_similar(source,target):
     source = str(source).lower()
     target = str(target).lower()
     max_len = max(len(source),len(target))
     min_len = min(len(source),len(target))
+    min_ratio = 90
+    if min_len>=3:
+        min_ratio = 87
+    if min_len>=5:
+        min_ratio = 85
     # dis_len = abs(len(source)-len(target))
     # min_dis = min(max_len*0.2,4)
     if min_len==0 and max_len>0:
@@ -40,26 +56,33 @@ def is_similar(source,target):
     if max_len<=2:
         if source==target:
             return True
-    else:
-        #判断相似度
-        similar = fuzz.ratio(source,target)
-        if similar>90:
+    if min_len<2:
+        return False
+    #判断相似度
+    similar = fuzz.ratio(source,target)
+    if similar>=min_ratio:
+        return True
+    similar_jaro = Levenshtein.jaro(source,target)
+    if similar_jaro*100>=min_ratio:
+        return True
+    similar_jarow = Levenshtein.jaro_winkler(source,target)
+    if similar_jarow*100>=90:
+        return True
+
+    if min_len>=5:
+        if len(source)==max_len and str(source).find(target)>=0:
+                return True
+        elif len(target)==max_len and target.find(source)>=0:
+                return True
+        elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
+            return True
+    # 全中文判断是否包含
+    if len(source)==max_len and judge_pur_chinese(target):
+        if str(source).find(target)>=0:
+            return True
+    if len(target)==max_len and judge_pur_chinese(source):
+        if target.find(source)>=0:
             return True
-        # 全中文判断是否包含
-        if min_len>=6:
-            if len(source)==max_len:
-                if str(source).find(target)>=0:
-                    return True
-            else:
-                if target.find(source)>=0:
-                    return True
-        if judge_pur_chinese(source) and judge_pur_chinese(target):
-            if len(source)==max_len:
-                if str(source).find(target)>=0:
-                    return True
-            else:
-                if target.find(source)>=0:
-                    return True
     return False
 
 
@@ -114,13 +137,16 @@ def has_same_specs_count(source, target):
 
     return True
 
+SPECS_PATTERN = re.compile("[^A-Za-z0-9-\\/()().]")
 def is_legal_specs(specs):
     if specs is None or specs=="":
         return False
     specs = str(specs).lower()
-    for s in specs:
-        if s not in SPECS_CHECK_SET:
-            return False
+    if re.search(SPECS_PATTERN,specs) is not None:
+        return False
+    # for s in specs:
+    #     if re.search(SPECS_PATTERN,s) is not None:
+    #         return False
     return True
 
 
@@ -153,7 +179,6 @@ def check_specs(source,target):
     return False
 
 
-
 import json
 
 import requests
@@ -184,7 +209,7 @@ def clean_product_brand(product_brand):
     '''
     return product_brand
 
-SPECS_PATTERN = re.compile("[^A-Za-z0-9-\\/()().]")
+
 def clean_product_specs(product_specs):
     '''
     clean before insert
@@ -228,5 +253,6 @@ def clean_product_quantity(product_quantity):
     return ""
 
 if __name__ == '__main__':
-    print(clean_product_specs("XY-K-JLJ-3A"))
-    print(check_specs("3.6",'3.6'))
+    print(is_similar('空气波压力治疗仪','空气波治疗仪'))
+    import Levenshtein
+    print(Levenshtein.ratio('助听器','助行器'))

+ 128 - 30
BaseDataMaintenance/maintenance/product/products.py

@@ -9,6 +9,8 @@ from BaseDataMaintenance.maintenance.product.productUtils import *
 from BaseDataMaintenance.model.ots.document_product_tmp import *
 from BaseDataMaintenance.model.ots.document_product import *
 from BaseDataMaintenance.model.ots.document_product_dict import *
+from BaseDataMaintenance.model.ots.document import *
+from BaseDataMaintenance.model.ots.attachment import *
 
 from tablestore import *
 
@@ -162,11 +164,11 @@ class Product_Manager(Product_Dict_Manager):
                         name_ots_id = ots_id
                         new_name = ots_name
 
-                        #update alias of name
-                        _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
-                        _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
-                        if _flag and _dpd.updateAlias(name):
-                            _dpd.update_row(self.ots_client)
+                        # #update alias of name
+                        # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
+                        # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                        # if _flag and _dpd.updateAlias(name):
+                        #     _dpd.update_row(self.ots_client)
                         break
         if name_ots_id is not None:
             if brand is not None and brand!="":
@@ -212,16 +214,17 @@ class Product_Manager(Product_Dict_Manager):
                                                 DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                 }
                                     _dpd_brand = Document_product_dict(_d_brand)
-                                    _dpd_brand.updateAlias(str(new_brand).lower())
+                                    # _dpd_brand.updateAlias(str(new_brand).lower())
                                     if not _dpd_brand.exists_row(self.ots_client):
                                         _dpd_brand.update_row(self.ots_client)
                                     else:
-                                        #update alias
-                                        _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
-                                        _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
-                                        if _flag:
-                                            if _dpd.updateAlias(brand):
-                                                _dpd.update_row(self.ots_client)
+                                        pass
+                                        # #update alias
+                                        # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
+                                        # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                                        # if _flag:
+                                        #     if _dpd.updateAlias(brand):
+                                        #         _dpd.update_row(self.ots_client)
 
                                 _find = True
                                 break
@@ -235,7 +238,7 @@ class Product_Manager(Product_Dict_Manager):
 
             if specs is not None and specs!="":
                 specs_vector = request_embedding(specs)
-                log("getting sepcs %s"%(specs))
+                debug("getting sepcs %s"%(specs))
                 if specs_vector is not None:
                     Coll,_ = self.get_collection(SPECS_GRADE)
                     search_list = search_embedding(Coll,embedding_index_name,[specs_vector],self.search_params,output_fields,limit=60)
@@ -268,16 +271,17 @@ class Product_Manager(Product_Dict_Manager):
                                                 DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                 }
                                     _dpd_specs = Document_product_dict(_d_specs)
-                                    _dpd_specs.updateAlias(str(new_specs).lower())
+                                    # _dpd_specs.updateAlias(str(new_specs).lower())
                                     if not _dpd_specs.exists_row(self.ots_client):
                                         _dpd_specs.update_row(self.ots_client)
                                     else:
-                                        #update alias
-                                        _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:specs_ots_id})
-                                        _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
-                                        if _flag:
-                                            if _dpd.updateAlias(specs):
-                                                _dpd.update_row(self.ots_client)
+                                        pass
+                                        # #update alias
+                                        # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:specs_ots_id})
+                                        # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                                        # if _flag:
+                                        #     if _dpd.updateAlias(specs):
+                                        #         _dpd.update_row(self.ots_client)
                             else:
                                 # log("check_specs failed")
                                 new_specs = clean_product_specs(specs)
@@ -288,7 +292,7 @@ class Product_Manager(Product_Dict_Manager):
                                     _md5 = get_document_product_dict_id(brand_ots_id,new_specs)
                                     _d = {DOCUMENT_PRODUCT_DICT_ID:_md5,
                                           DOCUMENT_PRODUCT_DICT_NAME:new_specs,
-                                          DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(specs.lower()),
+                                          DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(new_specs.lower()),
                                           DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
                                           DOCUMENT_PRODUCT_DICT_STATUS:1,
                                           DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
@@ -296,7 +300,7 @@ class Product_Manager(Product_Dict_Manager):
                                           DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                           }
                                     _dpd = Document_product_dict(_d)
-                                    _dpd.updateAlias(new_specs)
+                                    # _dpd.updateAlias(new_specs)
                                     _dpd.update_row(self.ots_client)
                             break
 
@@ -340,7 +344,7 @@ class Product_Manager(Product_Dict_Manager):
             _product.setValue(DOCUMENT_PRODUCT_UNIT_PRICE,unit_price,True)
             _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
             if isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)):
-                total_price = "%.2f"%(unit_price*quantity)
+                total_price = float("%.2f"%(unit_price*quantity))
                 _product.setValue(DOCUMENT_PRODUCT_TOTAL_PRICE,total_price,True)
 
             new_id = self.get_product_id(docid,new_name,new_brand,new_specs,unit_price,quantity)
@@ -363,6 +367,15 @@ class Product_Manager(Product_Dict_Manager):
             _product.setValue(DOCUMENT_PRODUCT_BRANDSPECS,"%s&&%s"%(new_brand,new_specs),True)
             _product.setValue(DOCUMENT_PRODUCT_FULL_NAME,"%s&&%s&&%s"%(new_name,new_brand,new_specs),True)
 
+            _product.setValue(DOCUMENT_PRODUCT_CREATE_TIME,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
+
+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,""),True)
+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,""),True)
+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,""),True)
+
+            bid_filemd5s = self.get_bid_filemd5s(docid,self.ots_client)
+            if bid_filemd5s is not None:
+                _product.setValue(DOCUMENT_PRODUCT_BID_FILEMD5S,bid_filemd5s,True)
 
             if self.dumplicate(_product):
                 _status = randint(201,300)
@@ -379,6 +392,54 @@ class Product_Manager(Product_Dict_Manager):
         save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_STATUS,_status,True)
         save_product_tmp.update_row(self.ots_client)
 
+    @staticmethod
+    def get_bid_filemd5s(docid,ots_client):
+
+        bool_query = BoolQuery(must_queries=[
+            TermQuery("docids",docid)
+        ])
+        rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
+                                                                            SearchQuery(bool_query,limit=10),
+                                                                            columns_to_get=ColumnsToGet(["docids"],return_type=ColumnReturnType.SPECIFIED))
+        list_data = getRow_ots(rows)
+
+        list_bid_filemd5s = []
+        set_docids = set([docid])
+        set_md5s = set()
+
+        for _d in list_data:
+            try:
+                docids = _d.get("docids","")
+                for _id in docids.split(","):
+                    set_docids.add(int(_id))
+            except Exception as e:
+                pass
+        list_docids = list(set_docids)
+        for _docid in list_docids:
+            _d = {document_partitionkey:_docid%500+1,
+                  document_docid:_docid}
+            _doc = Document(_d)
+            _doc.fix_columns(ots_client,[document_attachment_path],True)
+            page_attachments = _doc.getProperties().get(document_attachment_path)
+            if page_attachments is not None and page_attachments!="":
+                attachments = json.loads(page_attachments)
+                for _a in attachments:
+                    _filemd5 = _a.get(document_attachment_path_filemd5)
+                    if _filemd5 in set_md5s:
+                        continue
+                    set_md5s.add(_filemd5)
+                    _da = {attachment_filemd5:_filemd5}
+                    _attach = attachment(_da)
+                    _attach.fix_columns(ots_client,[attachment_classification],True)
+                    if _attach.getProperties().get(attachment_classification,"")=="招标文件":
+                        list_bid_filemd5s.append(_filemd5)
+        if len(list_bid_filemd5s)==0:
+            return None
+        return ",".join(list(set(list_bid_filemd5s)))
+
+
+
+
     def get_value_count(self,name,brand,specs,unit_price,quantity):
 
         value_count = 0
@@ -432,7 +493,7 @@ class Product_Manager(Product_Dict_Manager):
                 return list_data[0].get(DOCUMENT_PRODUCT_ID),1
 
         if len(name)>0 and len(brand)>0 and len(supplier)>0 and len(tenderee)>0:
-            log("docid %s name %s page_time_before %s page_time_after %s brand %s supplier %s tenderee %s"%(str(docid),name,page_time_before,page_time_after,brand,supplier,tenderee))
+            # log("docid %s name %s page_time_before %s page_time_after %s brand %s supplier %s tenderee %s"%(str(docid),name,page_time_before,page_time_after,brand,supplier,tenderee))
             bool_query = BoolQuery(must_queries=[TermQuery("name",name),
                                                  RangeQuery("page_time",page_time_before,page_time_after,True,True),
                                                  TermQuery(DOCUMENT_PRODUCT_BRAND,brand),
@@ -531,22 +592,24 @@ def start_process_product():
     pm.start_processing()
 
 def fix_product_data():
+
     '''
     # delete document_product and change the record status to 1 in document_product_temp which id=original id
     :return:
     '''
     ots_client = getConnect_ots()
-    bool_query = BoolQuery(must_queries=[RangeQuery("status",1)])
+    bool_query = BoolQuery(must_queries=[TermQuery("docid",309258275)
+                                         ])
 
     rows,next_token,total_count,is_all_succeed = ots_client.search("document_product","document_product_index",
                                                                    SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
-                                                                   columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID],return_type=ColumnReturnType.SPECIFIED))
+                                                                   columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DOCID,DOCUMENT_PRODUCT_PROJECT_NAME],return_type=ColumnReturnType.SPECIFIED))
 
     list_rows = getRow_ots(rows)
     while next_token:
         rows,next_token,total_count,is_all_succeed = ots_client.search('document_product','document_product_index',
                                                                        SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                       columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID],return_type=ColumnReturnType.SPECIFIED))
+                                                                       columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DOCID,DOCUMENT_PRODUCT_PROJECT_NAME],return_type=ColumnReturnType.SPECIFIED))
         list_rows.extend(getRow_ots(rows))
 
     task_queue = Queue()
@@ -555,13 +618,42 @@ def fix_product_data():
     def handle(item,result_queue):
         original_id = item.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
 
+        # # delete data and rerun
+        # _d = {DOCUMENT_PRODUCT_TMP_ID:original_id,DOCUMENT_PRODUCT_TMP_STATUS:1}
+        # dpt = Document_product_tmp(_d)
+        # dpt.update_row(ots_client)
+        #
+        # _d = {DOCUMENT_PRODUCT_ID:item.get(DOCUMENT_PRODUCT_ID)}
+        # dp = Document_product(_d)
+        # dp.delete_row(ots_client)
+
+
         _d = {DOCUMENT_PRODUCT_TMP_ID:original_id,DOCUMENT_PRODUCT_TMP_STATUS:1}
         dpt = Document_product_tmp(_d)
-        dpt.update_row(ots_client)
+        dpt.fix_columns(ots_client,["name","brand","specs"],True)
 
         _d = {DOCUMENT_PRODUCT_ID:item.get(DOCUMENT_PRODUCT_ID)}
         dp = Document_product(_d)
-        dp.delete_row(ots_client)
+
+        # back-fill doctitle (when project_name is empty), bid_filemd5s and the original name/brand/specs
+        docid = int(item.get(DOCUMENT_PRODUCT_DOCID))
+        partitionkey = docid%500+1
+        project_name = item.get(DOCUMENT_PRODUCT_PROJECT_NAME,"")
+        if project_name=="":
+            #fix project_name
+            _doc = Document({"partitionkey":partitionkey,
+                             "docid":docid})
+            _doc.fix_columns(ots_client,["doctitle"],True)
+            dp.setValue(DOCUMENT_PRODUCT_DOCTITLE,_doc.getProperties().get("doctitle"),True)
+        bid_filemd5s = Product_Manager.get_bid_filemd5s(docid,ots_client)
+        if bid_filemd5s is not None:
+            dp.setValue(DOCUMENT_PRODUCT_BID_FILEMD5S,bid_filemd5s,True)
+
+        dp.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,dpt.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,""),True)
+        dp.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,dpt.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,""),True)
+        dp.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,dpt.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,""),True)
+        dp.update_row(ots_client)
+
 
     mt = MultiThreadHandler(task_queue,handle,None,30,1)
     mt.run()
@@ -575,4 +667,10 @@ def test():
 if __name__ == '__main__':
 
     # start_process_product()
-    print(getMD5('11936c56f2dd1426764e317ca2e8e1a7'+'&&鱼跃'))
+    # print(getMD5('11936c56f2dd1426764e317ca2e8e1a7'+'&&鱼跃'))
+    test()
+    print(Product_Manager.get_bid_filemd5s(174802483,getConnect_ots()))
+    name = "一"
+    ots_name = "一氧化碳分析仪"
+    print(is_similar(name,ots_name),check_product(name,ots_name))
+    print(is_legal_specs('SCM-A/SB(0.18D)'))

+ 5 - 2
BaseDataMaintenance/model/ots/document_product.py

@@ -42,6 +42,11 @@ DOCUMENT_PRODUCT_DICT_BRAND_ID = "dict_brand_id"
 DOCUMENT_PRODUCT_DICT_SPECS_ID = "dict_specs_id"
 
 DOCUMENT_PRODUCT_DUMP_ID = "dump_id"
+DOCUMENT_PRODUCT_ORIGINAL_NAME = "original_name"
+DOCUMENT_PRODUCT_ORIGINAL_BRAND = "original_brand"
+DOCUMENT_PRODUCT_ORIGINAL_SPECS = "original_specs"
+
+DOCUMENT_PRODUCT_BID_FILEMD5S = "bid_filemd5s"
 
 
 
@@ -62,8 +67,6 @@ class Document_product(BaseModel):
 
 
 
-
-
         self.table_name = 'document_product'
 
     def getPrimary_keys(self):