- """
- Corpus preprocessing script
- Usage:
- preprocess.py [options]
- preprocess.py --split-in=<num-splits> --run-part=<num-part>
- preprocess.py --increment-ner
- preprocess.py -h | --help | --version
- Options:
- -h --help Show this screen
- --multiple-cores=<num-cores> Number of cores (use all to use every processor)
- --increment-ner Re run NER and Gazetter for every document. If a document lacked any of the previous steps, will be preprocessed entirely.
- --version Version number
- """
import logging
import multiprocessing
import os

from docopt import docopt

# Hide all GPUs so preprocessing runs on CPU only; these variables must be set
# before any CUDA-aware library gets imported.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import iepy

# Configure the IEPY instance before importing the modules that use it.
iepy.setup(__file__)

from iepy.data.db import DocumentManager
from iepy.selfpreprocess.self_preprocess import SelfPreprocesser
from iepy.selfpreprocess.pipeline import PreProcessPipeline, PreProcessSteps
# from iepy.preprocess.stanford_preprocess import StanfordPreprocess
# from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
# from iepy.preprocess.segmenter import SyntacticSegmenterRunner
class ParallelDocManager(DocumentManager):

    def mines_of(self, qset, number_of_processors, my_id):
        """Return the slice of ``qset`` assigned to worker ``my_id``: the
        documents whose ``id % number_of_processors == my_id``."""
        K = number_of_processors
        N = my_id
        # The doubled "%%%%" survives Python string formatting as "%%", which
        # the database layer renders as a literal "%" (the SQL modulo operator).
        clause = 'id %%%% %s = %s' % (K, N)
        return qset.extra(where=[clause])
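
# For example (illustrative values): with number_of_processors=4 and my_id=1,
# the extra WHERE clause becomes "id % 4 = 1", so that worker only sees the
# documents whose primary key leaves remainder 1. The slices are disjoint and
# together cover the whole queryset.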

def start_preprocess(docs, increment_ner):
    pipeline = PreProcessPipeline([
        SelfPreprocesser(increment_ner),
        # SyntacticSegmenterRunner(increment=True)
    ], docs)
    pipeline.process_everything()

if __name__ == '__main__':
    logger = logging.getLogger('preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    opts = docopt(__doc__, version=iepy.__version__)
    increment_ner = opts['--increment-ner']

    dm = ParallelDocManager()
    # Only documents still lacking the "brat" preprocessing step are queued.
    all_docs = dm.get_documents_lacking_preprocess([PreProcessSteps.brat])

    multiple_cores = opts.get('--multiple-cores')
    split_in = opts.get("--split-in")
    run_part = opts.get("--run-part")
    if multiple_cores:
        if multiple_cores == "all":
            multiple_cores = multiprocessing.cpu_count()
        try:
            multiple_cores = int(multiple_cores)
        except ValueError:
            logger.error("Invalid number of cores")
            exit(1)
        # One worker process per core; each worker preprocesses its own
        # disjoint slice of the pending documents, partitioned by document id.
        for i in range(multiple_cores):
            process = multiprocessing.Process(
                target=start_preprocess,
                args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner),
            )
            process.start()
    elif split_in:
        try:
            split_in = int(split_in)
            run_part = int(run_part) - 1  # parts are 1-based on the command line
        except ValueError:
            logger.error("Invalid split")
            exit(1)
        if run_part < 0 or run_part >= split_in:
            logger.error("Parts must be between 1 and {}".format(split_in))
            exit(1)
        docs = dm.mines_of(all_docs, split_in, run_part)
        start_preprocess(docs, increment_ner)
    else:
        start_preprocess(all_docs, increment_ner)