# preprocess.py
  1. """
  2. Corpus preprocessing script
  3. Usage:
  4. preprocess.py [options]
  5. preprocess.py --split-in=<num-splits> --run-part=<num-part>
  6. preprocess.py --increment-ner
  7. preprocess.py -h | --help | --version
  8. Options:
  9. -h --help Show this screen
  10. --multiple-cores=<num-cores> Number of cores (use all to use every processor)
  11. --increment-ner Re-run NER and Gazetteer for every document. If a document lacked any of the previous steps, it will be preprocessed entirely.
  12. --version Version number
  13. """
  14. import logging
  15. from docopt import docopt
  16. import os
  17. os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
  18. os.environ["CUDA_VISIBLE_DEVICES"] = ""
  19. import iepy
  20. import multiprocessing
  21. iepy.setup(__file__)
  22. from iepy.data.db import DocumentManager
  23. from iepy.selfpreprocess.self_preprocess import SelfPreprocesser
  24. from iepy.selfpreprocess.pipeline import PreProcessPipeline, PreProcessSteps
  25. # from iepy.preprocess.stanford_preprocess import StanfordPreprocess
  26. # from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
  27. # from iepy.preprocess.segmenter import SyntacticSegmenterRunner
  28. class ParallelDocManager(DocumentManager):
  29. def mines_of(self, qset, number_of_processors, my_id):
  30. K = number_of_processors
  31. N = my_id
  32. clause = 'id %%%% %s = %s' % (K, N)
  33. return qset.extra(where=[clause])
  34. def start_preprocess(docs, increment_ner):
  35. pipeline = PreProcessPipeline([
  36. SelfPreprocesser(increment_ner),
  37. # SyntacticSegmenterRunner(increment=True)
  38. ], docs)
  39. pipeline.process_everything()
  40. if __name__ == '__main__':
  41. logger = logging.getLogger(u'preprocess')
  42. logger.setLevel(logging.INFO)
  43. logging.basicConfig(level=logging.INFO, format='%(message)s')
  44. opts = docopt(__doc__, version=iepy.__version__)
  45. increment_ner = opts['--increment-ner']
  46. dm = ParallelDocManager()
  47. all_docs = dm.get_documents_lacking_preprocess(
  48. [PreProcessSteps.sentencer,PreProcessSteps.tokenization])
  49. multiple_cores = opts.get('--multiple-cores')
  50. split_in = opts.get("--split-in")
  51. run_part = opts.get("--run-part")
  52. if multiple_cores:
  53. if multiple_cores == "all":
  54. multiple_cores = multiprocessing.cpu_count()
  55. try:
  56. multiple_cores = int(multiple_cores)
  57. except ValueError:
  58. logger.error("Invalid number of cores")
  59. exit(1)
  60. for i in range(multiple_cores):
  61. process = multiprocessing.Process(
  62. target=start_preprocess, args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner)
  63. )
  64. process.start()
  65. elif split_in:
  66. try:
  67. split_in = int(split_in)
  68. run_part = int(run_part) - 1
  69. except ValueError:
  70. logger.error("Invalid split")
  71. exit(1)
  72. if run_part < 0 or run_part > split_in:
  73. logger.error("Parts must be between 1 and {}".format(split_in))
  74. exit(1)
  75. docs = dm.mines_of(all_docs, split_in, run_part)
  76. start_preprocess(docs, increment_ner)
  77. else:
  78. start_preprocess(all_docs, increment_ner)