fix.py 3.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
'''
Created on 2019-10-12
@author: User
'''
  5. import glob
  6. from bs4 import BeautifulSoup
  7. import re
  8. import shutil
  9. import codecs
  10. from BiddingKG.dl.test.test4 import predict
  11. from BiddingKG.dl.common.Utils import save, load
  12. def tofix():
  13. '''
  14. @summary: 获取需要fix的数据
  15. '''
  16. #paths = ["C:\\Users\\User\\Desktop\\数据20191011\\*.html","C:\\Users\\User\\Desktop\\数据20191011_multi\\*.html"]
  17. paths = ["C:\\Users\\User\\Desktop\\数据20191014\\*.html"]
  18. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十]{1,4}")
  19. target_dir = "C:\\Users\\User\\Desktop\\tofix1"
  20. list_articles = []
  21. _count1 = 0
  22. _count2 = 0
  23. MAX_COUNT = 500
  24. with codecs.open(target_dir+".txt","w",encoding="utf8") as f:
  25. for path in paths:
  26. files = glob.glob(path)
  27. for file in files:
  28. filename = file.split("/")[-1]
  29. _content = open(file,"r",encoding="utf8").read()
  30. _soup = BeautifulSoup(_content,"lxml")
  31. _text = _soup.get_text()
  32. _find = ""
  33. _set_pack = set()
  34. if _count1>=MAX_COUNT and _count2>=MAX_COUNT:
  35. break
  36. for _iter in re.finditer("(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))",_text):
  37. _find += "/"+_text[_iter.span()[0]-3:_iter.span()[1]+3]
  38. temp_package_number = re.findall(number_pattern,_text[_iter.span()[0]:_iter.span()[1]])[0]
  39. if temp_package_number!="":
  40. _set_pack.add(temp_package_number)
  41. if len(_set_pack)>1:
  42. print(file.split("\\")[-1],_find.replace("\n",""))
  43. f.write(file.split("\\")[-1]+" "+_find.replace("\n","")+"\n")
  44. shutil.copy(file,target_dir)
  45. if re.search("_52_",file) is not None:
  46. if _count1<MAX_COUNT:
  47. _count1 += 1
  48. _label = predict("12",_content)
  49. list_articles.append({"content":_content,"label":_label,"predict":_label,"filename":filename})
  50. if re.search("_101_",file) is not None:
  51. if _count2<MAX_COUNT:
  52. _count2 += 1
  53. _label = predict("12",_content)
  54. list_articles.append({"content":_content,"label":_label,"predict":_label,"filename":filename})
  55. save(list_articles, "traindata/article_label_1000_muti.pk")
  56. def getFile(filename):
  57. path = "C:\\Users\\User\\Desktop\\数据20191014\\"
  58. file = path+filename
  59. dest_dir = "C:\\Users\\User\\Desktop\\getfile"
  60. shutil.copy(file,dest_dir)
  61. if __name__=="__main__":
  62. #tofix()
  63. #getFile("比地_101_79888947.html")
  64. data = load("traindata/article_label_1000_muti.pk")
  65. data1 = load("label_0_1000.pk")
  66. for i in range(84):
  67. print(data[i]["filename"])
  68. print(data[i]["content"][:10])
  69. print(data1[i]["content"][:10])
  70. data[i]["label"] = data1[i]["label"]
  71. save(data, "traindata/article_label_1000_muti.pk")