# fix.py
'''
Created on 2019-10-12
@author: User
'''
import codecs
import glob
import os
import re
import shutil

from bs4 import BeautifulSoup

from BiddingKG.dl.common.Utils import save, load
from BiddingKG.dl_dev.test.test4 import predict
  12. def tofix():
  13. '''
  14. @summary: 获取需要fix的数据
  15. '''
  16. #paths = ["C:\\Users\\User\\Desktop\\数据20191011\\*.html","C:\\Users\\User\\Desktop\\数据20191011_multi\\*.html"]
  17. paths = ["C:\\Users\\User\\Desktop\\数据20191014\\*.html"]
  18. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十]{1,4}")
  19. target_dir = "C:\\Users\\User\\Desktop\\tofix1"
  20. list_articles = []
  21. _count1 = 0
  22. _count2 = 0
  23. MAX_COUNT = 500
  24. with codecs.open(target_dir+".txt","w",encoding="utf8") as f:
  25. for path in paths:
  26. files = glob.glob(path)
  27. for file in files:
  28. filename = file.split("/")[-1]
  29. _content = open(file,"r",encoding="utf8").read()
  30. _soup = BeautifulSoup(_content,"lxml")
  31. _text = _soup.get_text()
  32. _find = ""
  33. _set_pack = set()
  34. if _count1>=MAX_COUNT and _count2>=MAX_COUNT:
  35. break
  36. for _iter in re.finditer("(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))",_text):
  37. _find += "/"+_text[_iter.span()[0]-3:_iter.span()[1]+3]
  38. temp_package_number = re.findall(number_pattern,_text[_iter.span()[0]:_iter.span()[1]])[0]
  39. if temp_package_number!="":
  40. _set_pack.add(temp_package_number)
  41. if len(_set_pack)>1:
  42. print(file.split("\\")[-1],_find.replace("\n",""))
  43. f.write(file.split("\\")[-1]+" "+_find.replace("\n","")+"\n")
  44. shutil.copy(file,target_dir)
  45. if re.search("_52_",file) is not None:
  46. if _count1<MAX_COUNT:
  47. _count1 += 1
  48. _label = predict("12",_content)
  49. list_articles.append({"content":_content,"label":_label,"predict":_label,"filename":filename})
  50. if re.search("_101_",file) is not None:
  51. if _count2<MAX_COUNT:
  52. _count2 += 1
  53. _label = predict("12",_content)
  54. list_articles.append({"content":_content,"label":_label,"predict":_label,"filename":filename})
  55. save(list_articles, "../../dl_dev/money/traindata/article_label_1000_muti.pk")
  56. def getFile(filename):
  57. path = "C:\\Users\\User\\Desktop\\数据20191014\\"
  58. file = path+filename
  59. dest_dir = "C:\\Users\\User\\Desktop\\getfile"
  60. shutil.copy(file,dest_dir)
  61. if __name__=="__main__":
  62. #tofix()
  63. #getFile("比地_101_79888947.html")
  64. data = load("../../dl_dev/money/traindata/article_label_1000_muti.pk")
  65. data1 = load("label_0_1000.pk")
  66. for i in range(84):
  67. print(data[i]["filename"])
  68. print(data[i]["content"][:10])
  69. print(data1[i]["content"][:10])
  70. data[i]["label"] = data1[i]["label"]
  71. save(data, "../../dl_dev/money/traindata/article_label_1000_muti.pk")