Ver Fonte

保留单个的空格以解决预处理中时间被分割的问题,联系人去重maxcompute代码

rogel há 4 anos atrás
pai
commit
6a5f8dd152

+ 35 - 38
BiddingKG/dl/interface/Preprocessing.py

@@ -628,7 +628,7 @@ def tableToText(soup):
         # packPattern = "(标包|[标包][号段名])"
         packPattern = "(标包|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
         rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标)"  # 2020/11/23 大网站规则,添加序号为排序
-        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
+        entityPattern = "(候选|([中投]标|报价)|单位名称|供应商|金额)"
         height = len(inner_table)
         width = len(inner_table[0])
         text = ""
@@ -989,9 +989,9 @@ def tableToText(soup):
 
 #数据清洗
 def segment(soup):
-    # print("==")
-    # print(soup)
-    # print("====")
+    print("==")
+    print(soup)
+    print("====")
     #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
     if soup.name=="td":
         #判断有值叶子节点数
@@ -1006,21 +1006,21 @@ def segment(soup):
                 if '...' in soup.get_text() and (soup.get_text()[:-3]).strip() in soup.attrs['title']:
                     text = soup.attrs['title']
 
-            _list = []
-            for x in re.split("\s+",text):
-                if x.strip()!="":
-                    _list.append(len(x))
-            if len(_list)>0:
-                _minLength = min(_list)
-                if _minLength>2:
-                    _substr = ","
-                else:
-                    _substr = ""
-            else:
-                _substr = ""
-            text = _substr.join(re.split("(\s+)",text))
+            # _list = []
+            # for x in re.split("\s+",text):
+            #     if x.strip()!="":
+            #         _list.append(len(x))
+            # if len(_list)>0:
+            #     _minLength = min(_list)
+            #     if _minLength>2:
+            #         _substr = ","
+            #     else:
+            #         _substr = ""
+            # else:
+            #     _substr = ""
+            # text = _substr.join(re.split("(\s+)",text))
             text = text.replace("\r\n",",").replace("\n",",")
-            text = re.sub("^[,\s]*|[,\s]*$","",text)
+            # text = re.sub("^[,\s]*|[,\s]*$","",text)
             return text
     segList = ["title"]
     commaList = ["div","br","td","p"]
@@ -1052,23 +1052,7 @@ def segment(soup):
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
     
          
-    #删除标签中的所有空格
-    for subs in subspaceList:
-        patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
-        while(True):
-            oneMatch = re.search(re.compile(patten),text)
-            if oneMatch is not None:
-                _match = oneMatch.group(1)
-                _minLength = min([len(x) for x in re.split("(\s*)",_match)])
-                if _minLength>2:
-                    _substr = ","
-                else:
-                    _substr = ""
-                text = text.replace("#subs"+str(subs)+"#"+oneMatch.group(1)+"#sube"+str(subs)+"#",re.sub("\s",_substr,oneMatch.group(1)))
-            else:
-                break
-    
-    
+
     #替换"""为"“",否则导入deepdive出错
     text = text.replace('"',"“").replace("\r","").replace("\n",",")
     text = re.sub("\s{4,}",",",text)   
@@ -1076,26 +1060,39 @@ def segment(soup):
 
     #替换连续的标点
 
-    punc_pattern = "(?P<del>[。,;::,\s]+)"
+    punc_pattern = "(?P<del>[。,;::,\s]{2,})"
 
     list_punc = re.findall(punc_pattern,text)
     list_punc.sort(key=lambda x:len(x),reverse=True)
     for punc_del in list_punc:
         if len(punc_del)>1:
-            text = re.sub(punc_del+"\s*",punc_del[-1],text)
+            text = re.sub(punc_del,punc_del[-1],text)
         
 
     #将连续的中文句号替换为一个
     text_split = text.split("。")
     text_split = [x for x in text_split if len(x)>0]
     text = "。".join(text_split)
+
+    # #删除标签中的所有空格
+    # for subs in subspaceList:
+    #     patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
+    #     while(True):
+    #         oneMatch = re.search(re.compile(patten),text)
+    #         if oneMatch is not None:
+    #             _match = oneMatch.group(1)
+    #             text = text.replace("#subs"+str(subs)+"#"+_match+"#sube"+str(subs)+"#",_match)
+    #         else:
+    #             break
+
     # text过大报错
     LOOP_LEN = 10000
     LOOP_BEGIN = 0
     _text = ""
+
     if len(text)<10000000:
         while(LOOP_BEGIN<len(text)):
-            _text += re.sub(")",")",re.sub("(","(",re.sub("\s*","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s{2,}","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
     else:
         return text

+ 169 - 0
BiddingKG/maxcompute/contactDumplicate.py

@@ -0,0 +1,169 @@
+from odps.udf import annotate
+from odps.udf import BaseUDAF
+from odps.udf import BaseUDTF
+
+@annotate('string,string,string,string,bigint,datetime,string,string,string,string->string')
+class dumplicate(BaseUDAF):
+    """Aggregate contact rows and keep only the most recently created one.
+
+    Every input row is buffered as a 10-element list. ``terminate`` selects
+    the row with the largest ``create_time`` timestamp and returns it as a
+    JSON array string (non-ASCII characters are kept as-is).
+    """
+
+    def __init__(self):
+        # The modules are imported here and published as module globals so
+        # that the other methods of this class can reach them.
+        import datetime
+        import json
+        import logging
+        global datetime,json,logging,MyEncoder
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        class MyEncoder(json.JSONEncoder):
+            """JSON encoder that serializes ``bytes`` values as UTF-8 text."""
+
+            def default(self, obj):
+                if isinstance(obj, bytes):
+                    return str(obj, encoding='utf-8')
+                return json.JSONEncoder.default(self, obj)
+
+    def new_buffer(self):
+        """Return a fresh buffer: a one-element list holding the row list."""
+        return [[]]
+
+    def iterate(self, buffer, company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city):
+        """Append one input row to the buffer.
+
+        ``company_name`` is stripped and ``create_time`` is stored as a POSIX
+        timestamp. Either may be None; calling ``.strip()`` / ``.timestamp()``
+        on None would abort the whole aggregation, so both are guarded.
+        """
+        if company_name is not None:
+            company_name = company_name.strip()
+        created = create_time.timestamp() if create_time is not None else None
+        # No per-row logging: this method runs once for every input row.
+        buffer[0].append([company_name,mobile_no,phone_no,contact_person,level,created,email,company_addr,province,city])
+
+    def merge(self, buffer, pbuffer):
+        """Fold the rows of a partial buffer into ``buffer``."""
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        """Return the newest buffered row as a JSON array string.
+
+        Returns None when no row was buffered. Rows without a timestamp rank
+        below every dated row; among rows with equal timestamps the one that
+        was buffered first wins.
+        """
+        rows = buffer[0]
+        if not rows:
+            return None
+        newest = max(rows, key=lambda row: float("-inf") if row[5] is None else row[5])
+        return json.dumps(newest,cls=MyEncoder,ensure_ascii=False)
+
+
+@annotate("string->string,string,string,string,bigint,datetime,string,string,string,string")
+class liberate(BaseUDTF):
+    """Expand a 10-element JSON array string into one output row.
+
+    The expected element order is company_name, mobile_no, phone_no,
+    contact_person, level, create_time (POSIX timestamp), email,
+    company_addr, province, city -- the layout produced by
+    ``dumplicate.terminate`` in this file.
+    """
+
+    def __init__(self):
+        # The modules are imported here and published as module globals so
+        # that ``process`` can reach them.
+        import json
+        import time
+        import logging
+        import datetime
+        global json,MyEncoder,logging,time,datetime
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        class MyEncoder(json.JSONEncoder):
+            """JSON encoder that serializes ``bytes`` values as UTF-8 text."""
+
+            def default(self, obj):
+                if isinstance(obj, bytes):
+                    return str(obj, encoding='utf-8')
+                return json.JSONEncoder.default(self, obj)
+
+
+    def process(self, json_dumplicate):
+        """Parse ``json_dumplicate`` and forward its fields as one row.
+
+        A None input produces no row. Input that cannot be parsed is skipped
+        as before, but the error and the offending text are now logged
+        instead of being discarded silently.
+        """
+        if json_dumplicate is None:
+            return
+        try:
+            logging.info(json_dumplicate)
+            # Drop literal \n, \" and \r escape sequences from the JSON text
+            # before parsing it.
+            cleaned = json_dumplicate.replace("\\n","").replace('\\"','').replace("\\r","")
+            company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city = json.loads(cleaned)
+            # The timestamp is turned back into a datetime for the output column.
+            create_time = datetime.datetime.fromtimestamp(create_time)
+            self.forward(company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city)
+        except Exception as e:
+            # Best effort: one bad record must not fail the job, but it has
+            # to be visible in the log.
+            logging.info(str(e))
+            logging.info(json_dumplicate)
+
+import re
+# A mobile number is exactly 11 digits starting with "1". Raw string so that
+# "\d" reaches the regex engine instead of being an invalid string escape.
+mobile_pattern = re.compile(r"^1\d{10}$")
+def recog_likeType(phone):
+    """Classify ``phone`` as ``"mobile"`` or ``"phone"``.
+
+    Returns ``"mobile"`` when ``phone`` matches ``mobile_pattern`` and
+    ``"phone"`` for every other string. ``phone`` must be a str.
+    """
+    if mobile_pattern.search(phone) is not None:
+        return "mobile"
+    return "phone"
+
+@annotate("string,string,string,string,string,string->string")
+class f_tojson_docuentContact(object):
+    """Build a JSON list of contact dicts for a document's tenderee and agency."""
+
+    def __init__(self):
+        # Imported here and published as a module global for ``evaluate``.
+        import json
+        global json
+
+
+    def evaluate(self, tenderee,tenderee_contact,tenderee_phone,agency,agency_contact,agency_phone):
+        """Return a JSON array with up to two contact entries.
+
+        An entry is emitted for the tenderee and for the agency only when
+        company, contact person and phone are all present (neither None nor
+        the empty string). Each entry has the keys ``company``,
+        ``contact_person``, ``level`` (always 20) and either ``mobile_no`` or
+        ``phone_no``, chosen by ``recog_likeType``.
+        """
+        list_contact = []
+        candidates = (
+            (tenderee,tenderee_contact,tenderee_phone),
+            (agency,agency_contact,agency_phone),
+        )
+        for company,contact_person,phone in candidates:
+            # The same presence test is applied to all three fields so that a
+            # None company or contact never yields a half-empty entry.
+            if not (company and contact_person and phone):
+                continue
+            _dict = {"company":company,"contact_person":contact_person,"level":20}
+            if recog_likeType(phone)=="mobile":
+                _dict["mobile_no"] = phone
+            else:
+                _dict["phone_no"] = phone
+            list_contact.append(_dict)
+        return json.dumps(list_contact)
+
+@annotate("string->string,string,string,string,bigint,string")
+class f_liberate_contactJson(BaseUDTF):
+    """Expand a JSON list of contact dicts into one output row per contact."""
+
+    def __init__(self):
+        # The modules are imported here and published as module globals so
+        # that ``process`` can reach them.
+        import json
+        import time
+        import logging
+        import datetime
+        global json,logging,time,datetime
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+    def process(self, json_contact):
+        """Forward (company, contact_person, mobile_no, phone_no, level, mail) per contact.
+
+        ``mobile_no`` and ``phone_no`` default to "" when missing or None.
+        ``phone_no`` is reduced to digits, "-" and "转", and is blanked when
+        fewer than 6 characters remain. Any error is logged together with the
+        input and ends processing of that input.
+        """
+        try:
+            list_dict = json.loads(json_contact)
+            for _dict in list_dict:
+                company = _dict.get("company")
+                contact_person = _dict.get("contact_person")
+                mobile_no = _dict.get("mobile_no")
+                if mobile_no is None:
+                    mobile_no = ""
+                phone_no = _dict.get("phone_no")
+                if phone_no is None:
+                    phone_no = ""
+                else:
+                    # Raw string: "\-" must reach the regex engine rather
+                    # than being an invalid string escape.
+                    phone_no = re.sub(r'[^0-9\-转]','',phone_no)
+                    if len(phone_no)<6:
+                        phone_no = ""
+                level = _dict.get("level")
+                mail = _dict.get("mail","")
+                self.forward(company,contact_person,mobile_no,phone_no,level,mail)
+        except Exception as e:
+            logging.info(str(e))
+            logging.info(json_contact)
+
+@annotate('string->bigint')
+class f_count_company(BaseUDAF):
+    """Count the distinct ``company_name`` values seen by the aggregate."""
+
+    def __init__(self):
+        # Imported here and published as module globals.
+        import logging
+        import json
+        import datetime
+        global datetime,json,logging,MyEncoder
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        """Return a fresh buffer: a one-element list holding an empty set."""
+        distinct_names = set()
+        return [distinct_names]
+
+    def iterate(self, buffer, company_name):
+        """Record one ``company_name`` in the buffer's set."""
+        # NOTE(review): a None company_name is stored like any other value
+        # and therefore counts as one distinct name -- confirm this is wanted.
+        seen = buffer[0]
+        seen.add(company_name)
+
+    def merge(self, buffer, pbuffer):
+        """Union the names of a partial buffer into ``buffer``."""
+        buffer[0].update(pbuffer[0])
+
+    def terminate(self, buffer):
+        """Return how many distinct names were recorded."""
+        seen = buffer[0]
+        return len(seen)