pipeline.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. import logging
  2. from enum import Enum
  3. logger = logging.getLogger(__name__)
  4. class PreProcessSteps(Enum):
  5. # numbers do not imply order
  6. tokenization = 1
  7. lemmatization = 6
  8. sentencer = 2
  9. tagging = 3
  10. ner = 4
  11. segmentation = 5
  12. syntactic_parsing = 7
  13. brat = 8
  14. class PreProcessPipeline(object):
  15. """Coordinates the pre-processing tasks on a set of documents"""
  16. def __init__(self, step_runners, documents_manager):
  17. """Takes a list of callables and a documents-manager.
  18. Step Runners may be any callable. It they have an attribute step,
  19. then that runner will be treated as the responsible for
  20. accomplishing such a PreProcessStep.
  21. """
  22. from iepy.data.db import DocumentManager # circular imports safety
  23. self.step_runners = step_runners
  24. if not isinstance(documents_manager, DocumentManager):
  25. documents_manager = DocumentManager(documents_manager)
  26. self.documents = documents_manager
  27. def walk_document(self, doc):
  28. """Computes all the missing pre-process steps for the given document"""
  29. for step in self.step_runners:
  30. step(doc)
  31. return
  32. def process_step_in_batch(self, runner):
  33. """Tries to apply the required step to all documents lacking it"""
  34. logger.info('Starting preprocessing step %s', runner)
  35. if hasattr(runner, 'step') and not (runner.override or runner.increment):
  36. docs = self.documents.get_documents_lacking_preprocess(runner.step)
  37. else:
  38. docs = self.documents # everything
  39. for i, doc in enumerate(docs):
  40. runner(doc)
  41. logger.info('\tDone for %i documents', i + 1)
  42. def process_everything(self):
  43. """Tries to apply all the steps to all documents"""
  44. for runner in self.step_runners:
  45. self.process_step_in_batch(runner)
  46. class BasePreProcessStepRunner(object):
  47. # If it's for a particular step, you can write
  48. # step = PreProcessSteps.something
  49. def __init__(self, override=False, increment=False):
  50. self.override = override
  51. self.increment = increment
  52. def __call__(self, doc):
  53. # You'll have to:
  54. # - Check if the document satisfies pre-conditions, and if not, do nothing
  55. # - Explicitely store pre process results on the document
  56. # - Based on the "override" paramenter, and on your checks to see if the step
  57. # was already done or not, decide if you will
  58. # - skip
  59. # - re-do step.
  60. raise NotImplementedError