Kaynağa Gözat

要素提取补充AI提取

luojiehua 2 ay önce
ebeveyn
işleme
457f56fe7d

+ 3 - 30
BaseDataMaintenance/maintenance/2.py

@@ -1,36 +1,9 @@
 #coding:utf8
 
-from bs4 import BeautifulSoup
 import re
 
-def html2text(_html):
-    # 如果输入是字符串,使用 BeautifulSoup 解析
-    if isinstance(_html, str):
-        _soup = BeautifulSoup(_html, "lxml")
-    else:
-        _soup = _html
+a = '19028,919.1万元'
 
-    # 用于存储处理后的文本
-    result_parts = []
+b = re.search(r'[\d,,\.]+[亿万元人民币]+',a)
 
-    _find = False
-    # 遍历所有直接子元素
-    for child in _soup.find_all(recursive=False):
-        if child.name in ["table", "tbody"]:
-            # 如果是表格或表格主体,保留 HTML 代码
-            result_parts.append(str(child))
-        else:
-            # 递归处理其他元素并转换为文本
-            text = html2text(child)
-            result_parts.append(text)
-        _find = True
-    if not _find:
-        result_parts.append(str(_soup.get_text()))
-
-    # 将所有处理后的部分连接成一个字符串
-    result = "\n".join(result_parts)
-    return result
-
-if __name__ == '__main__':
-    _html = "<div><p>这是一个p</p><table><tr><td>这是一个td</td></tr></table></div>"
-    print(html2text(_html))
+print(b.group())

+ 8 - 1
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -1534,6 +1534,11 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
 
 
     def merge_json(self,extract_json,extract_ai_json):
+
+        def get_ai_money(_text):
+            b = re.search(r'[\d,,\.]+[亿万元人民币]+',_text)
+            if b is not None:
+                return b.group()
         _extract = {}
         if extract_json is not None:
             try:
@@ -1620,6 +1625,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
 
         if not has_budget or budget_unexpected:
             _budget = _extract_ai.get("招标信息",{}).get("项目预算","")
+            _budget = get_ai_money(_budget)
             if _budget is not None and _budget!="":
                 _budget = getUnifyMoney(_budget)
                 if _budget>0:
@@ -1647,7 +1653,8 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                         else:
                             _pack = "Project"
                     _win_money = _win_dict.get("中标金额")
-                    if _win_money!="":
+                    _win_money = get_ai_money(_win_money)
+                    if _win_money is not None and _win_money!="":
                         _win_money = getUnifyMoney(_win_money)
                     else:
                         _win_money = 0