4 years ago · 6a5f8dd152
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -628,7 +628,7 @@ def tableToText(soup):
 
				         # packPattern = "(标包|[标包][号段名])"
			
 
				         packPattern = "(标包|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则，补充采购类包名
			
 
				         rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标)"  # 2020/11/23 大网站规则，添加序号为排序
			
 
				-        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
			
 
				+        entityPattern = "(候选|([中投]标|报价)|单位名称|供应商|金额)"
			
 
				         height = len(inner_table)
			
 
				         width = len(inner_table[0])
			
 
				         text = ""
			
@@ -989,9 +989,9 @@ def tableToText(soup):
 
				 
			
 
				 #数据清洗
			
 
				 def segment(soup):
			
 
				-    # print("==")
			
 
				-    # print(soup)
			
 
				-    # print("====")
			
 
				+    print("==")
			
 
				+    print(soup)
			
 
				+    print("====")
			
 
				     #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
			
 
				     if soup.name=="td":
			
 
				         #判断有值叶子节点数
			
@@ -1006,21 +1006,21 @@ def segment(soup):
 
				                 if '...' in soup.get_text() and (soup.get_text()[:-3]).strip() in soup.attrs['title']:
			
 
				                     text = soup.attrs['title']
			
 
				 
			
 
				-            _list = []
			
 
				-            for x in re.split("\s+",text):
			
 
				-                if x.strip()!="":
			
 
				-                    _list.append(len(x))
			
 
				-            if len(_list)>0:
			
 
				-                _minLength = min(_list)
			
 
				-                if _minLength>2:
			
 
				-                    _substr = "，"
			
 
				-                else:
			
 
				-                    _substr = ""
			
 
				-            else:
			
 
				-                _substr = ""
			
 
				-            text = _substr.join(re.split("(\s+)",text))
			
 
				+            # _list = []
			
 
				+            # for x in re.split("\s+",text):
			
 
				+            #     if x.strip()!="":
			
 
				+            #         _list.append(len(x))
			
 
				+            # if len(_list)>0:
			
 
				+            #     _minLength = min(_list)
			
 
				+            #     if _minLength>2:
			
 
				+            #         _substr = "，"
			
 
				+            #     else:
			
 
				+            #         _substr = ""
			
 
				+            # else:
			
 
				+            #     _substr = ""
			
 
				+            # text = _substr.join(re.split("(\s+)",text))
			
 
				             text = text.replace("\r\n","，").replace("\n","，")
			
 
				-            text = re.sub("^[，\s]*|[，\s]*$","",text)
			
 
				+            # text = re.sub("^[，\s]*|[，\s]*$","",text)
			
 
				             return text
			
 
				     segList = ["title"]
			
 
				     commaList = ["div","br","td","p"]
			
@@ -1052,23 +1052,7 @@ def segment(soup):
 
				     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])","；",text)
			
 
				     
			
 
				          
			
 
				-    #删除标签中的所有空格
			
 
				-    for subs in subspaceList:
			
 
				-        patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
			
 
				-        while(True):
			
 
				-            oneMatch = re.search(re.compile(patten),text)
			
 
				-            if oneMatch is not None:
			
 
				-                _match = oneMatch.group(1)
			
 
				-                _minLength = min([len(x) for x in re.split("(\s*)",_match)])
			
 
				-                if _minLength>2:
			
 
				-                    _substr = "，"
			
 
				-                else:
			
 
				-                    _substr = ""
			
 
				-                text = text.replace("#subs"+str(subs)+"#"+oneMatch.group(1)+"#sube"+str(subs)+"#",re.sub("\s",_substr,oneMatch.group(1)))
			
 
				-            else:
			
 
				-                break
			
 
				-    
			
 
				-    
			
 
				+
			
 
				     #替换"""为"“",否则导入deepdive出错
			
 
				     text = text.replace('"',"“").replace("\r","").replace("\n","，")
			
 
				     text = re.sub("\s{4,}","，",text)   
			
@@ -1076,26 +1060,39 @@ def segment(soup):
 
				 
			
 
				     #替换连续的标点
			
 
				 
			
 
				-    punc_pattern = "(?P<del>[。，；：:,\s]+)"
			
 
				+    punc_pattern = "(?P<del>[。，；：:,\s]{2,})"
			
 
				 
			
 
				     list_punc = re.findall(punc_pattern,text)
			
 
				     list_punc.sort(key=lambda x:len(x),reverse=True)
			
 
				     for punc_del in list_punc:
			
 
				         if len(punc_del)>1:
			
 
				-            text = re.sub(punc_del+"\s*",punc_del[-1],text)
			
 
				+            text = re.sub(punc_del,punc_del[-1],text)
			
 
				         
			
 
				 
			
 
				     #将连续的中文句号替换为一个
			
 
				     text_split = text.split("。")
			
 
				     text_split = [x for x in text_split if len(x)>0]
			
 
				     text = "。".join(text_split)
			
 
				+
			
 
				+    # #删除标签中的所有空格
			
 
				+    # for subs in subspaceList:
			
 
				+    #     patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
			
 
				+    #     while(True):
			
 
				+    #         oneMatch = re.search(re.compile(patten),text)
			
 
				+    #         if oneMatch is not None:
			
 
				+    #             _match = oneMatch.group(1)
			
 
				+    #             text = text.replace("#subs"+str(subs)+"#"+_match+"#sube"+str(subs)+"#",_match)
			
 
				+    #         else:
			
 
				+    #             break
			
 
				+
			
 
				     # text过大报错
			
 
				     LOOP_LEN = 10000
			
 
				     LOOP_BEGIN = 0
			
 
				     _text = ""
			
 
				+
			
 
				     if len(text)<10000000:
			
 
				         while(LOOP_BEGIN<len(text)):
			
 
				-            _text += re.sub("）",")",re.sub("（","(",re.sub("\s*","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
			
 
				+            _text += re.sub("）",")",re.sub("（","(",re.sub("\s{2,}","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
			
 
				             LOOP_BEGIN += LOOP_LEN
			
 
				     else:
			
 
				         return text
			
--- a/BiddingKG/maxcompute/contactDumplicate.py
+++ b/BiddingKG/maxcompute/contactDumplicate.py
@@ -0,0 +1,169 @@
 
				+from odps.udf import annotate
			
 
				+from odps.udf import BaseUDAF
			
 
				+from odps.udf import BaseUDTF
			
 
				+
			
 
				+@annotate('string,string,string,string,bigint,datetime,string,string,string,string->string')
			
 
				+class dumplicate(BaseUDAF):
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        import datetime
			
 
				+        import json
			
 
				+        import logging
			
 
				+        global datetime,json,logging,MyEncoder
			
 
				+
			
 
				+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
			
 
				+        class MyEncoder(json.JSONEncoder):
			
 
				+
			
 
				+            def default(self, obj):
			
 
				+                if isinstance(obj, bytes):
			
 
				+                    return str(obj, encoding='utf-8')
			
 
				+                return json.JSONEncoder.default(self, obj)
			
 
				+
			
 
				+    def new_buffer(self):
			
 
				+        return [[]]
			
 
				+
			
 
				+    def iterate(self, buffer, company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city):
			
 
				+        logging.info(company_name)
			
 
				+        buffer[0].append([company_name.strip(),mobile_no,phone_no,contact_person,level,create_time.timestamp(),email,company_addr,province,city])
			
 
				+        logging.info(company_name)
			
 
				+
			
 
				+    def merge(self, buffer, pbuffer):
			
 
				+        logging.info('-3=')
			
 
				+        buffer[0].extend(pbuffer[0])
			
 
				+        logging.info('-4=')
			
 
				+
			
 
				+    def terminate(self, buffer):
			
 
				+        logging.info('-1=')
			
 
				+        buffer[0].sort(key=lambda x:x[5],reverse=True)
			
 
				+        company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city = buffer[0][0]
			
 
				+        logging.info("-2=")
			
 
				+        return json.dumps([company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city],cls=MyEncoder,ensure_ascii=False)
			
 
				+
			
 
				+
			
 
				+@annotate("string->string,string,string,string,bigint,datetime,string,string,string,string")
			
 
				+class liberate(BaseUDTF):
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        import json
			
 
				+        import time
			
 
				+        import logging
			
 
				+        import datetime
			
 
				+        # import sys
			
 
				+        # reload(sys)
			
 
				+        # sys.setdefaultencoding('utf8')
			
 
				+        global json,MyEncoder,logging,time,datetime
			
 
				+
			
 
				+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
			
 
				+        class MyEncoder(json.JSONEncoder):
			
 
				+
			
 
				+            def default(self, obj):
			
 
				+                if isinstance(obj, bytes):
			
 
				+                    return str(obj, encoding='utf-8')
			
 
				+                return json.JSONEncoder.default(self, obj)
			
 
				+
			
 
				+
			
 
				+    def process(self, json_dumplicate):
			
 
				+        try:
			
 
				+            logging.info(json_dumplicate)
			
 
				+            json_dumplicate = json_dumplicate.replace("\\n","").replace('\\"','').replace("\\r","")
			
 
				+            company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city = json.loads(json_dumplicate)
			
 
				+            create_time = datetime.datetime.fromtimestamp(create_time)
			
 
				+            self.forward(company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city)
			
 
				+        except Exception as e:
			
 
				+            pass
			
 
				+
			
 
				+import re
			
 
				+mobile_pattern = re.compile("^1\d{10}$")
			
 
				+def recog_likeType(phone):
			
 
				+    if re.search(mobile_pattern,phone) is not None:
			
 
				+        return "mobile"
			
 
				+    else:
			
 
				+        return "phone"
			
 
				+
			
 
				+@annotate("string,string,string,string,string,string->string")
			
 
				+class f_tojson_docuentContact(object):
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        import json
			
 
				+        global json
			
 
				+
			
 
				+
			
 
				+    def evaluate(self, tenderee,tenderee_contact,tenderee_phone,agency,agency_contact,agency_phone):
			
 
				+        list_contact = []
			
 
				+        if tenderee!="" and tenderee_contact!="" and tenderee_phone!='' and tenderee_phone is not None:
			
 
				+            _dict = {"company":tenderee,"contact_person":tenderee_contact,"level":20}
			
 
				+            if recog_likeType(tenderee_phone)=="mobile":
			
 
				+                _dict["mobile_no"] = tenderee_phone
			
 
				+            else:
			
 
				+                _dict["phone_no"] = tenderee_phone
			
 
				+            list_contact.append(_dict)
			
 
				+        if agency!="" and agency_contact!="" and agency_phone!='' and agency_phone is not None:
			
 
				+            _dict = {"company":agency,"contact_person":agency_contact,"level":20}
			
 
				+            if recog_likeType(agency_phone)=="mobile":
			
 
				+                _dict["mobile_no"] = agency_phone
			
 
				+            else:
			
 
				+                _dict["phone_no"] = agency_phone
			
 
				+            list_contact.append(_dict)
			
 
				+        return json.dumps(list_contact)
			
 
				+
			
 
				+@annotate("string->string,string,string,string,bigint,string")
			
 
				+class f_liberate_contactJson(BaseUDTF):
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        import json
			
 
				+        import time
			
 
				+        import logging
			
 
				+        import datetime
			
 
				+        # import sys
			
 
				+        # reload(sys)
			
 
				+        # sys.setdefaultencoding('utf8')
			
 
				+        global json,MyEncoder,logging,time,datetime
			
 
				+
			
 
				+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
			
 
				+
			
 
				+
			
 
				+    def process(self, json_contact):
			
 
				+        try:
			
 
				+            list_dict = json.loads(json_contact)
			
 
				+            for _dict in list_dict:
			
 
				+                company = _dict.get("company")
			
 
				+                contact_person = _dict.get("contact_person")
			
 
				+                mobile_no = _dict.get("mobile_no","")
			
 
				+                if mobile_no is None:
			
 
				+                    mobile_no = ""
			
 
				+                phone_no = _dict.get("phone_no","")
			
 
				+                if phone_no is None:
			
 
				+                    phone_no = ""
			
 
				+                else:
			
 
				+                    phone_no = re.sub('[^0-9\-转]','',phone_no)
			
 
				+                    if len(phone_no)<6:
			
 
				+                        phone_no = ""
			
 
				+                level = _dict.get("level")
			
 
				+                mail = _dict.get("mail","")
			
 
				+                self.forward(company,contact_person,mobile_no,phone_no,level,mail)
			
 
				+        except Exception as e:
			
 
				+            logging.info(str(e))
			
 
				+            logging.info(json_contact)
			
 
				+
			
 
				+@annotate('string->bigint')
			
 
				+class f_count_company(BaseUDAF):
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        import datetime
			
 
				+        import json
			
 
				+        import logging
			
 
				+        global datetime,json,logging,MyEncoder
			
 
				+
			
 
				+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
			
 
				+
			
 
				+    def new_buffer(self):
			
 
				+        return [set()]
			
 
				+
			
 
				+    def iterate(self, buffer, company_name):
			
 
				+        buffer[0].add(company_name)
			
 
				+
			
 
				+    def merge(self, buffer, pbuffer):
			
 
				+        buffer[0] |= pbuffer[0]
			
 
				+
			
 
				+    def terminate(self, buffer):
			
 
				+        return len(buffer[0])