rogel il y a 5 ans
commit
ec7973e324
100 fichiers modifiés avec 5644 ajouts et 0 suppressions
  1. 3 0
      .gitignore
  2. 14 0
      AUTHORS
  3. 44 0
      ChangeLog
  4. 27 0
      LICENSE
  5. 30 0
      LICENSE_details.txt
  6. 15 0
      MANIFEST.in
  7. 88 0
      README.rst
  8. 1 0
      docs/Changelog
  9. 153 0
      docs/Makefile
  10. 151 0
      docs/active_learning_tutorial.rst
  11. 244 0
      docs/conf.py
  12. 105 0
      docs/corpus_labeling.rst
  13. BIN
      docs/create_eo.png
  14. 45 0
      docs/gazettes.rst
  15. BIN
      docs/home_screenshot.png
  16. 192 0
      docs/how_to_hack.rst
  17. 0 0
      docs/iepy_1.svg
  18. 0 0
      docs/iepy_2.svg
  19. 0 0
      docs/iepy_3.svg
  20. 0 0
      docs/iepy_4.svg
  21. 85 0
      docs/index.rst
  22. 79 0
      docs/installation.rst
  23. 137 0
      docs/instantiation.rst
  24. BIN
      docs/label_by_document_entity_edition.png
  25. BIN
      docs/label_by_document_relation_labeled.png
  26. BIN
      docs/label_by_document_screenshot.png
  27. BIN
      docs/label_by_segment_screenshot.png
  28. BIN
      docs/labels_by_iepy.png
  29. 60 0
      docs/language.rst
  30. 231 0
      docs/preprocess.rst
  31. 188 0
      docs/rules_tutorial.rst
  32. 2 0
      docs/setup/requirements-base.txt
  33. 3 0
      docs/setup/requirements-development.txt
  34. 0 0
      docs/setup/system_packages.txt
  35. 4 0
      docs/setup/third_party.txt
  36. 38 0
      docs/troubleshooting.rst
  37. 110 0
      docs/tutorial.rst
  38. 19 0
      docs/virtualenv.rst
  39. 7 0
      examples/birthdate/scripts/create_birthdate_relation.py
  40. 47 0
      examples/birthdate/scripts/csv_to_iepy.py
  41. 34 0
      examples/birthdate/scripts/preprocess.py
  42. 42 0
      examples/birthdate/settings.py
  43. 122 0
      examples/birthdate/was_born_rules_sample.py
  44. 1 0
      examples/credit/__init__.py
  45. 49 0
      examples/credit/annotation.conf
  46. 7 0
      examples/credit/articles.csv
  47. 28 0
      examples/credit/bin/csv_to_iepy.py
  48. 76 0
      examples/credit/bin/gazettes_loader.py
  49. 59 0
      examples/credit/bin/iepy_rules_runner.py
  50. 184 0
      examples/credit/bin/iepy_runner.py
  51. 12 0
      examples/credit/bin/manage.py
  52. 96 0
      examples/credit/bin/preprocess.py
  53. 149 0
      examples/credit/bin/rules_verifier.py
  54. 241 0
      examples/credit/bin/settlement.py
  55. 20 0
      examples/credit/extractor_config.json
  56. 6 0
      examples/credit/format.py
  57. 2 0
      examples/credit/rules.py
  58. 182 0
      examples/credit/settings.py
  59. BIN
      examples/credit/test.sqlite
  60. 1 0
      examples/product/__init__.py
  61. 49 0
      examples/product/annotation.conf
  62. 7 0
      examples/product/articles.csv
  63. 28 0
      examples/product/bin/csv_to_iepy.py
  64. 76 0
      examples/product/bin/gazettes_loader.py
  65. 59 0
      examples/product/bin/iepy_rules_runner.py
  66. 184 0
      examples/product/bin/iepy_runner.py
  67. 12 0
      examples/product/bin/manage.py
  68. 96 0
      examples/product/bin/preprocess.py
  69. 149 0
      examples/product/bin/rules_verifier.py
  70. 241 0
      examples/product/bin/settlement.py
  71. 20 0
      examples/product/extractor_config.json
  72. 6 0
      examples/product/format.py
  73. 188 0
      examples/product/product_article.csv
  74. 2 0
      examples/product/rules.py
  75. 182 0
      examples/product/settings.py
  76. BIN
      examples/product/test.sqlite
  77. 1 0
      examples/test/__init__.py
  78. 49 0
      examples/test/annotation.conf
  79. 7 0
      examples/test/articles.csv
  80. BIN
      examples/test/bin/2020-08-01-2020-08-31要素标注统计.xls
  81. BIN
      examples/test/bin/None-2020-09-25要素标注统计.xls
  82. BIN
      examples/test/bin/None-2020-10-31要素标注统计.xls
  83. BIN
      examples/test/bin/None-2020-11-25要素标注统计.xls
  84. BIN
      examples/test/bin/None-2020-12-25要素标注统计.xls
  85. 28 0
      examples/test/bin/csv_to_iepy.py
  86. 76 0
      examples/test/bin/gazettes_loader.py
  87. 59 0
      examples/test/bin/iepy_rules_runner.py
  88. 184 0
      examples/test/bin/iepy_runner.py
  89. 12 0
      examples/test/bin/manage.py
  90. 96 0
      examples/test/bin/preprocess.py
  91. 149 0
      examples/test/bin/rules_verifier.py
  92. 251 0
      examples/test/bin/settlement.py
  93. BIN
      examples/test/bin/分组_1.xls
  94. BIN
      examples/test/bin/分组_10.xls
  95. BIN
      examples/test/bin/分组_2.xls
  96. BIN
      examples/test/bin/分组_3.xls
  97. BIN
      examples/test/bin/分组_4.xls
  98. BIN
      examples/test/bin/分组_5.xls
  99. BIN
      examples/test/bin/分组_6.xls
  100. BIN
      examples/test/bin/分组_7.xls

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+*.pyc
+iepy.egg-info
+/.idea/

+ 14 - 0
AUTHORS

@@ -0,0 +1,14 @@
+Authors
+
+- Rafael Carrascosa <rcarrascosa@machinalis.com> (rafacarrascosa at github)
+- Javier Mansilla <jmansilla@machinalis.com> (jmansilla at github)
+- Gonzalo García Berrotarán <ggarcia@machinalis.com> (j0hn at github)
+- Daniel Moisset <dmoisset@machinalis.com> (dmoisset at github)
+- Franco M. Luque <francolq@famaf.unc.edu.ar> (francolq at github)
+
+Contributors
+
+- Marcos Spontón (msponton@machinalis.com)
+- Laura Alonso i Alemany (lalonsoialemany@machinalis.com)
+- Patricio Del Boca (pdelboca@machinalis.com)
+- Elías Andrawos (eandrawos@machinalis.com)

+ 44 - 0
ChangeLog

@@ -0,0 +1,44 @@
+0.9.6
+    - Fixed some dependencies declarations to provide support for python 3.5
+    - Bug fix respect to active learning predictions
+    - Added support for German preprocess (thanks @sweh)
+
+0.9.5
+    - Bug fix on TokenizerSentencerRunner (thanks ezesalta)
+    - Fix on installation dependencies
+    - Tokenization options can be handled from instance settings file
+
+0.9.4
+    - Added multicore preprocess
+    - Added support for Stanford 3.5.2 preprocess models
+
+0.9.3
+    - Added grammatical parsing to the preprocess flow of documents
+    - Added support for Spanish preprocess
+    - Restricted each iepy-instance to a single language
+    - Gazetter support
+    - Labeling UI improvements
+    - Performance and memory usage improvements
+    - Model simplifications (labels, metadata)
+    - Storage & view of predictions
+
+0.9.2
+    - Add ability to use custom features (http://iepy.rtfd.org/en/latest/how_to_hack.html#implementing-your-own-features)
+    - Add ability to use rules as features (http://iepy.rtfd.org/en/latest/how_to_hack.html#using-rules-as-features)
+    - Add rules verifier (http://iepy.rtfd.org/en/latest/rules_tutorial.html#verifying-your-rules)
+    - Fixed bugs of compatibility with firefox [thanks dchaplinsky for the bug report]
+    - Skip instead of crashing when a document could not be loaded via csv importer [thanks dchaplinsky for the report and suggestion]
+    - Performance improvement on rules runner
+    - Change instance files schema, now it's a python package and renamed settings.
+    - Add lemmatization to the pre-process (http://iepy.rtfd.org/en/latest/preprocess.html#lemmatization)
+    - Fix critical bug on loading rules
+    - Fix critical bug on ranking questions on the active learning extraction runner
+
+0.9.1
+    - Add entity kind on the modal dialog
+    - Change arrows display to be more understandable
+    - Join skip and don't know label options
+    - Change options dropdown for radio buttons
+    - Show help for shortcuts and change the order of the options
+    - Documents rich view (without needing to be labeling the document for some relation)
+    - instance upgrader

+ 27 - 0
LICENSE

@@ -0,0 +1,27 @@
+Copyright (c) Machinalis and individual contributors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+    3. Neither the name of Machinalis nor the names of its contributors may be
+       used to endorse or promote products derived from this software without
+       specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 30 - 0
LICENSE_details.txt

@@ -0,0 +1,30 @@
+The python libraries (and their licenses) that we are explicitly depending on,
+are the following ones,
+
+    - nltk (Apache License)
+    - numpy (BSD)
+    - scipy (BSD)
+    - scikit-learn (BSD)
+    - mock (BSD)
+    - docopt (MIT)
+    - future (MIT)
+    - appdirs (MIT)
+    - wget (Public Domain)
+    - colorama (BSD)
+    - featureforge (BSD)
+
+The development tools we are using:
+
+    - nose (LGPL)
+    - factory-boy (MIT)
+
+Additionally, in order to be able to create your own iepy-ready corpus with our
+preprocessing tools, you'll need to download the following things that are not
+provided by this software
+
+    - punkt tokenizer (acquirable with the NLTK downloader or the
+                       download_third_party_data script)
+    - wordnet (acquirable with the NLTK downloader or the
+               download_third_party_data script)
+    - GPL Stanford CoreNLP (acquirable with download_third_party_data script)
+    - GPL Stanford Spanish Models (acquirable with download_third_party_data script)

+ 15 - 0
MANIFEST.in

@@ -0,0 +1,15 @@
+include README.rst
+include AUTHORS
+include LICENSE
+include MANIFEST.in
+include ChangeLog
+include iepy/version.txt
+
+recursive-include iepy/instantiation *.template
+recursive-include iepy/preprocess/utils *.jar
+recursive-include iepy/webui/corpus/static *
+recursive-include iepy/webui/corpus/templates *
+recursive-include docs/setup requirements*.txt
+
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]

+ 88 - 0
README.rst

@@ -0,0 +1,88 @@
+IEPY
+====
+
+IEPY is an open source tool for
+`Information Extraction <http://en.wikipedia.org/wiki/Information_extraction>`_
+focused on Relation Extraction.
+
+To give an example of Relation Extraction, if we are trying to find a
+birth date in:
+
+    `"John von Neumann (December 28, 1903 – February 8, 1957) was a Hungarian and
+    American pure and applied mathematician, physicist, inventor and polymath."`
+
+then IEPY's task is to identify "``John von Neumann``" and
+"``December 28, 1903``" as the subject and object entities of the "``was born in``"
+relation.
+
+It's aimed at:
+    - `users <http://iepy.readthedocs.org/en/latest/active_learning_tutorial.html>`_
+      needing to perform Information Extraction on a large dataset.
+    - `scientists <http://iepy.readthedocs.org/en/latest/how_to_hack.html>`_
+      wanting to experiment with new IE algorithms.
+
+Features
+--------
+
+    - `A corpus annotation tool <http://iepy.readthedocs.org/en/latest/corpus_labeling.html>`_
+      with a `web-based UI <http://iepy.readthedocs.org/en/latest/corpus_labeling.html#document-based-labeling>`_
+    - `An active learning relation extraction tool <http://iepy.readthedocs.org/en/latest/active_learning_tutorial.html>`_
+      pre-configured with convenient defaults.
+    - `A rule based relation extraction tool <http://iepy.readthedocs.org/en/latest/rules_tutorial.html>`_
+      for cases where the documents are semi-structured or high precision is required.
+    - A web-based user interface that:
+        - Allows layman users to control some aspects of IEPY.
+        - Allows decentralization of human input.
+    - A shallow entity ontology with coreference resolution via `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_
+    - `An easily hack-able active learning core <http://iepy.readthedocs.org/en/latest/how_to_hack.html>`_,
+      ideal for scientist wanting to experiment with new algorithms.
+
+Installation
+------------
+
+Install the required packages:
+
+.. code-block:: bash
+
+    sudo apt-get install build-essential python3-dev liblapack-dev libatlas-dev gfortran openjdk-7-jre
+
+Then simply install with **pip**:
+
+.. code-block:: bash
+
+    pip install iepy
+
+Full details about the installation is available on the
+`Read the Docs <http://iepy.readthedocs.org/en/latest/installation.html>`__ page.
+
+Running the tests
+-----------------
+
+If you are contributing to the project and want to run the tests, all you have to do is:
+
+    - Make sure your JAVAHOME is correctly set. `Read more about it here <http://iepy.readthedocs.io/en/latest/installation.html#install-iepy-package>`_
+    - In the root of the project run `nosetests`
+
+Learn more
+----------
+
+The full documentation is available on `Read the Docs <http://iepy.readthedocs.org/en/latest/>`__.
+
+
+Authors
+-------
+
+IEPY is © 2014 `Machinalis <http://www.machinalis.com/>`_ in collaboration
+with the `NLP Group at UNC-FaMAF <http://pln.famaf.unc.edu.ar/>`_. Its primary
+authors are:
+
+ * Rafael Carrascosa <rcarrascosa@machinalis.com> (rafacarrascosa at github)
+ * Javier Mansilla <jmansilla@machinalis.com> (jmansilla at github)
+ * Gonzalo García Berrotarán <ggarcia@machinalis.com> (j0hn at github)
+ * Franco M. Luque <francolq@famaf.unc.edu.ar> (francolq at github)
+ * Daniel Moisset <dmoisset@machinalis.com> (dmoisset at github)
+
+You can follow the development of this project and report issues at
+http://github.com/machinalis/iepy
+
+You can join the mailing list `here <https://groups.google.com/forum/?hl=es-419#%21forum/iepy>`__

+ 1 - 0
docs/Changelog

@@ -0,0 +1 @@
+../ChangeLog

+ 153 - 0
docs/Makefile

@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/IEPY.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/IEPY.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/IEPY"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/IEPY"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."

+ 151 - 0
docs/active_learning_tutorial.rst

@@ -0,0 +1,151 @@
+Running the active learning core
+================================
+
+The active learning core works by trying to predict the relations using information provided by the user.
+This means you'll have to label some of the examples and based on those, the core will infer the rest.
+The core will also give you to label the more important examples (those which best helps
+to figure out the other cases).
+
+To start using it you'll need to define a relation, run the core, label some evidence and re-run the core loop.
+You can also label evidences and re-run the core as much as you like to have a better performance.
+
+Creating a relation
+-------------------
+
+To create a relation, first `open up the web server <tutorial.html#open-the-web-interface>`__ if you haven't already, and use a
+web browser to navigate on `http://127.0.0.1:8000 <http://127.0.0.1:8000>`_.
+There you'll find instructions on how to create a relation.
+
+Running the core
+----------------
+
+After creating a relation, you can start the core to look for instances of that relation.
+
+You can run this core in two modes: **High precision** or **high recall**.
+`Precision and recall <http://en.wikipedia.org/wiki/Precision_and_recall>`_ can be traded with one another up to a certain point.  I.e. it is possible to trade some
+recall to get better precision and vice versa.
+
+To visualize better this trade off, lets see an example:
+A precision of 99% means that 1 of every 100 predicted relations will be wrong and the rest will be correct.
+A recall of 30% means that only 30 out of 100 existent relations will be detected by the algorithm and the rest
+will be wrongly discarded as "no relation present".
+
+Run the active learning core by doing:
+
+.. code-block:: bash
+
+    python bin/iepy_runner.py <relation_name> <output>
+
+And add ``--tune-for=high-prec`` or ``--tune-for=high-recall`` before the relation name to switch
+between modes. The default is **high precision**.
+
+This will run until it needs you to label some of the evidences. At this point, what you
+need to do is go to the web interface that you ran on the previous step, and there you
+can label some evidences.
+
+When you consider that is enough, on the prompt that the iepy runner presented you,
+continue the execution by typing **run**.
+
+That will cycle again and repeat the process.
+
+Run the active learning core in the command line and ask it to **STOP**.
+It'll save a csv with the automatic classifications for all evidences in the database.
+
+Also, note that you can only predict a relation for a text that has been inserted into the database.
+The csv output file has the primary key of an object in the database that represents the evidence that 
+was classified as "relation present" or "relation not present". An evidence object in the database is a
+rich-in-information object containing the entities and circumstances surrounding the prediction that 
+is too complex to put in a single csv file.
+
+In order to access the entities and other details you'll need to write a script 
+to talk with the database (see iepy/data/models.py).
+
+
+Fine tuning
+-----------
+
+If you want to modify the internal behavior, you can change the settings file. On your instance
+folder you'll fine a file called ``extractor_config.json``. There you've all the configuration
+for the internal classifier, such as:
+
+Classifier
+..........
+
+This sets the classifier algorithm to be used, you can choose from:
+
+    * sgd: `Stochastic Gradient Descent <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html>`_
+    * knn: `Nearest Neighbors <http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier>`_
+    * svc `(default)`: `C-Support Vector Classification <http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>`_
+    * randomforest: `Random Forest <http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html>`_
+    * adaboost: `AdaBoost <http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html>`_
+
+Features
+........
+
+Features to be used in the classifier, you can use a subset of:
+
+    * number_of_tokens
+    * symbols_in_between
+    * in_same_sentence
+    * verbs_count
+    * verbs_count_in_between
+    * total_number_of_entities
+    * other_entities_in_between
+    * entity_distance
+    * entity_order
+    * bag_of_wordpos_bigrams_in_between
+    * bag_of_wordpos_in_between
+    * bag_of_word_bigrams_in_between
+    * bag_of_pos_in_between
+    * bag_of_words_in_between
+    * bag_of_wordpos_bigrams
+    * bag_of_wordpos
+    * bag_of_word_bigrams
+    * bag_of_pos
+    * bag_of_words
+
+These can be added as `sparse` adding them into the
+`sparse_features` section or added as `dense` into the `dense_features`.
+
+The features in the sparse section will go through a stage of linear dimension reduction
+and the dense features, by default, will be used with a non-linear classifier.
+
+
+Viewing predictions on the web user interface
+---------------------------------------------
+
+If you prefer to review the predictions using the web interface is possible to run the
+active learning core in a way that stores the results on the database and they are accesible
+through the web.
+
+To do so, you'll have to run the core like this:
+
+.. code-block:: bash
+
+    python bin/iepy_runner.py --db-store <relation_name> 
+
+We do not have an specialized interface to review predictions but you can still view them
+by using the :doc:`interface to create a reference corpus <corpus_labeling>`.
+
+This way, you'll get labels as a new **judge** called iepy-run and a date.
+
+.. image:: labels_by_iepy.png
+
+
+Saving predictor for later use
+------------------------------
+
+Since training could be a slow process, you might want to save your trained predictor and
+re-use it several times without the need to train again.
+
+You can save it this by doing:
+
+.. code-block:: bash
+
+    python bin/iepy_runner.py --store-extractor=myextractor.pickle <relation_name> <output>
+
+And re use it like this:
+
+.. code-block:: bash
+
+    python bin/iepy_runner.py --trained-extractor=myextractor.pickle <relation_name> <output>

+ 244 - 0
docs/conf.py

@@ -0,0 +1,244 @@
+# -*- coding: utf-8 -*-
+#
+# IEPY documentation build configuration file, created by
+# sphinx-quickstart on Wed Apr 23 20:02:15 2014.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+AUTHORS = (u'Rafael Carrascosa, Javier Mansilla, Gonzalo García Berrotarán, '
+           'Daniel Moisset, Franco M. Luque')
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.viewcode']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'IEPY'
+copyright = u'2014, ' + AUTHORS
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+from os import path as _p
+with open(_p.join(_p.dirname(_p.abspath(__file__)), '..', 'iepy', 'version.txt')) as vfile:
+    version = vfile.readline().strip()
+# The full version, including alpha/beta/rc tags.
+release = version
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = "sphinx_rtd_theme"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = [os.getenv('VIRTUAL_ENV', '') + '/lib/python3.4/site-packages']
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'IEPYdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+    ('index', 'IEPY.tex', u'IEPY Documentation', AUTHORS, 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'iepy', u'IEPY Documentation', [AUTHORS], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    ('index', 'IEPY', u'IEPY Documentation', AUTHORS, 'IEPY',
+     'Information Extraction python library.', 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'

+ 105 - 0
docs/corpus_labeling.rst

@@ -0,0 +1,105 @@
+Creating a reference corpus
+===========================
+
+IEPY provides a web tool for creating a reference corpus in a simple and fast way. 
+This corpus can be used for evaluation or simply to have a labeled corpus of 
+relations between entity occurrences.
+
+
+Running the web server
+----------------------
+
+First of all, you need to run the web server that will provide the interface.
+This is done by running a *Django* server.
+
+Assuming you have an iepy instance and it's your current directory,
+to start the server you need to run 
+
+.. code-block:: sh
+    
+    $ python bin/manage.py runserver
+
+You will see a message like this:
+
+::
+
+    Starting development server at http://127.0.0.1:8000/
+    Quit the server with CONTROL-C.
+
+Home page
+---------
+
+At this point, you can go on and open a browser and access the URL `http://127.0.0.1:8000 <http://127.0.0.1:8000/>`_
+and you will get a screen like this:
+
+.. image:: home_screenshot.png
+
+
+After creating a relation, you can access it on the ``Create a reference corpus`` section of the home page.
+Once you get there, you'll find that there are two different ways to label evidences: by segment and by document.
+The default one is by document but you can switch between both of them.
+
+
+Document based labeling
+-----------------------
+
+This view presents a complete document with all the segments that make sense to show. These are
+the ones that contain entities of the entity kinds that your relation uses.
+
+.. image:: label_by_document_screenshot.png
+
+On the left side of the page, you'll see a list of the options that you have for labeling the evidences.
+To start labeling information, what you need to do is choose one of these options and then click on two
+entity occurrences (marked in yellow on the text).
+
+IEPY will only let you click on entity occurrences that have the type that your relation needs. Once
+you select the first entity occurrence, you will only be able to click on entities of the other
+entity type.
+
+.. image:: label_by_document_relation_labeled.png
+
+After saving, IEPY will automatically take you to the next document.
+Also, on top you have some navigation controls.
+
+.. note::
+
+    Be careful with the navigation buttons, because they won't save the changes that you've made on this document.
+
+
+Segment based labeling
+----------------------
+
+When labeling by segment, you are presented with a segment of a document, and you will have to
+answer if the relation is present on all the possible combinations of entity occurrences.
+
+.. image:: label_by_segment_screenshot.png
+
+Here what you will need to do is indicate, for every evidence, whether the relation is present or not.
+When saving, you will get another segment to label, and so on.
+
+On top you have navigation controls, and on the far right you have a link to switch to the
+document-based view.
+
+
+Fixing mistagged entity occurrences
+-----------------------------------
+
+It is possible that the automatic process that detects entities has made some mistakes.
+This leads to an entity tagged partially or incorrectly. In this case, we provide a tool to fix these problems.
+You can access this tool by right clicking on the problematic entity and choosing **Modify entity occurrence**
+
+.. image:: label_by_document_entity_edition.png
+
+There you can completely remove the entity or change the limits so it holds more (or less) tokens.
+
+
+Creating new occurrences
+------------------------
+
+If an entity occurrence wasn't detected automatically, you can add it manually. To do so, right click on
+any token and choose **Create entity occurrence**. 
+
+.. image:: create_eo.png
+
+You can modify the limits of the tokens and the entity kind there. After this operation, new *evidence candidates*
+will be created if needed.

BIN
docs/create_eo.png


+ 45 - 0
docs/gazettes.rst

@@ -0,0 +1,45 @@
+Gazettes resolution
+===================
+
+We call a gazette a mapping between a list of tokens and an entity kind. If that list of tokens
+matches exactly on your text, then that would be tagged as an entity. 
+
+All the entity occurrences that were detected by a gazette and share the same set of tokens will share the same entity.
+This means that if you have a gazette that finds ``Dr. House`` and tags it as a ``PERSON``, all the occurrences in the text
+that match those tokens will belong to the same entity.
+
+Basic usage: Loading from csv
+-----------------------------
+
+The basic usage would be including a set of gazettes before running the preprocess step. To include
+the gazettes on your database, you can use the script ``gazettes_loader.py`` that comes included with
+your instance. This will take a csv file with the following format:
+
+::
+
+    <literal>,<class>
+
+Literal can be a single token or multiple tokens separated by space.
+The only restriction is that every literal is unique.
+
+For example, a gazettes csv file could be:
+
+::
+
+    literal,class
+    Dr. House,PERSON
+    Lupus,DISEASE
+    Headache,SYMPTOMS
+
+
+Removing elements
+-----------------
+
+When deleting an entity, all the occurrences are deleted with it, along with the gazette item that introduced them.
+Same goes the other way, if you delete a gazette item, the entity, and therefore the occurrences, will be deleted as well.
+
+To delete a gazette item, go to the database admin page and find the Gazette section. You'll be able to find the one that you want
+to remove.
+
+To remove an entity, find an occurrence by exploring a document on any of its views, and right click it. There you'll find a delete
+link that enables you to remove the whole entity. Keep in mind that this action will delete the gazette item.

BIN
docs/home_screenshot.png


+ 192 - 0
docs/how_to_hack.rst

@@ -0,0 +1,192 @@
+How to Hack
+===========
+
+There are several places where you can incorporate your own ideas and needs into IEPY.
+Here you'll see how to modify different parts of the iepy core.
+
+Altering how the corpus is created
+----------------------------------
+
+On the `preprocess <preprocess.html#how-to-customize>`_ section was already mentioned that you can customize how the corpus is created.
+
+
+Using your own classifier
+-------------------------
+
+You can change the definition of the *extraction classifier* that is used when running
+iepy in *active learning* mode.
+
+As the simplest example of doing this, check the following example.
+First, define your own custom classifier, like this:
+
+.. code-block:: python
+
+    from sklearn.linear_model import SGDClassifier
+    from sklearn.pipeline import make_pipeline
+    from sklearn.feature_extraction.text import CountVectorizer
+
+
+    class MyOwnRelationClassifier:
+        def __init__(self, **config):
+            vectorizer = CountVectorizer(
+                preprocessor=lambda evidence: evidence.segment.text)
+            classifier = SGDClassifier()
+            self.pipeline = make_pipeline(vectorizer, classifier)
+
+        def fit(self, X, y):
+            self.pipeline.fit(X, y)
+            return self
+
+        def predict(self, X):
+            return self.pipeline.predict(X)
+
+        def decision_function(self, X):
+            return self.pipeline.decision_function(X)
+
+
+and later, in iepy_runner.py of your IEPY instance, in the **ActiveLearningCore** creation,
+provide it as a configuration parameter like this
+
+
+.. code-block:: python
+
+    iextractor = ActiveLearningCore(
+        relation, labeled_evidences,
+        tradeoff=tuning_mode,
+        extractor_config={},
+        extractor=MyOwnRelationClassifier
+    )
+
+
+Implementing your own features
+------------------------------
+
+Your classifier can use features that are already built within iepy or you can create your
+own. You can even use a rule (as defined in the :doc:`rules core <rules_tutorial>`) as feature.
+
+Start by creating a new file in your instance, you can call it whatever you want, but for this
+example lets call it ``custom_features.py``. There you'll define your features:
+
+.. code-block:: python
+
+    # custom_features.py
+    from featureforge.feature import output_schema
+
+    @output_schema(int, lambda x: x >= 0)
+    def tokens_count(evidence):
+        return len(evidence.segment.tokens)
+
+
+.. note::
+
+    Your features can use some of the `Feature Forge's <http://feature-forge.readthedocs.org/en/latest/>`__
+    capabilities.
+
+Once you've defined your feature you can use it in the classifier by adding it to the configuration
+file. You should have one on your instance with all the default values, it's called ``extractor_config.json``.
+
+There you'll find 2 sets of features where you can add it: dense or sparse. Depending on the values returned
+by your feature you'll choose one over the other.
+
+To include it, you have to add a line with a python path to your feature function. If you're not familiar with
+the format, you should follow this pattern:
+
+::
+
+    {project_name}.{features_file}.{feature_function}
+
+In our example, our instance is called ``born_date``, so in the config this would be:
+
+.. code-block:: json
+
+    "dense_features": [
+        ...
+        "born_date.custom_features.tokens_count",
+        ...
+    ],
+
+Remember that if you want to use that configuration file you have to use the option ``--extractor-config``
+
+
+Using rules as features
+-----------------------
+
+In the same way, and without doing any change to the rule, you can
+add it as feature by declaring it in your config like this:
+
+Suppose your instance is called ``born_date`` and your rule is called ``born_date_in_parenthesis``,
+then you'll do:
+
+
+.. code-block:: json
+
+    "dense_features": [
+        ...
+        "born_date.rules.born_date_in_parenthesis",
+        ...
+    ],
+
+This will run your rule as a feature that returns 0 if it didn't match and 1 if it matched.
+
+Using all rules as one feature
+..............................
+
+Suppose you have a bunch of rules defined in your rules file and instead of using each rule as a
+different feature you want to use a single feature that runs all the rules to test if the evidence
+matches. You can write a custom feature that does so. Let's look an example snippet:
+
+.. code-block:: python
+
+    # custom_features.py
+    import refo
+
+    from iepy.extraction.rules import compile_rule, generate_tokens_to_match, load_rules
+
+    rules = load_rules()
+
+
+    def rules_match(evidence):
+        tokens_to_match = generate_tokens_to_match(evidence)
+
+        for rule in rules:
+            regex = compile_rule(rule, evidence.relation)
+
+            if refo.match(regex, tokens_to_match):
+                if rule.answer:  # positive rule
+                    return 1
+                else:  # negative rule
+                    return -1
+        # no rule matched
+        return 0
+
+
+This will define a feature called ``rules_match`` that tries every rule for an evidence
+until a match occurs, and returns one of three different values, depending on the type
+of match.
+
+To use this you have to add this single feature to your config like this:
+
+.. code-block:: json
+
+    "dense_features": [
+        ...
+        "born_date.custom_features.rules_match",
+        ...
+    ],
+
+
+
+Documents Metadata
+------------------
+
+While building your application, you might want to store some extra information about your documents.
+To avoid loading this data every time when predicting, we've separated the place to put this 
+information into another model called **IEDocumentMetadata** that is accessible through the **metadata** attribute.
+
+IEDocumentMetadata has 3 fields:
+
+    * title: for storing document's title
+    * url: to save the source url if the document came from a web page
+    * items: a dictionary that you can use to store anything you want.
+
+By default, the **csv importer** uses the document's metadata to save the filepath of the csv file on the *items* field.

Fichier diff supprimé car celui-ci est trop grand
+ 0 - 0
docs/iepy_1.svg


Fichier diff supprimé car celui-ci est trop grand
+ 0 - 0
docs/iepy_2.svg


Fichier diff supprimé car celui-ci est trop grand
+ 0 - 0
docs/iepy_3.svg


Fichier diff supprimé car celui-ci est trop grand
+ 0 - 0
docs/iepy_4.svg


+ 85 - 0
docs/index.rst

@@ -0,0 +1,85 @@
+.. IEPY documentation master file, created by
+   sphinx-quickstart on Wed Apr 23 20:02:15 2014.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to IEPY's documentation!
+================================
+
+IEPY is an open source tool for
+`Information Extraction <http://en.wikipedia.org/wiki/Information_extraction>`_
+focused on Relation Extraction.
+
+To give an example of Relation Extraction, if we are trying to find a
+birth date in:
+
+    `"John von Neumann (December 28, 1903 – February 8, 1957) was a Hungarian and
+    American pure and applied mathematician, physicist, inventor and polymath."`
+
+then IEPY's task is to identify "``John von Neumann``" and
+"``December 28, 1903``" as the subject and object entities of the "``was born in``"
+relation.
+
+It's aimed at:
+    - :doc:`users <active_learning_tutorial>`
+      needing to perform Information Extraction on a large dataset.
+    - :doc:`scientists <how_to_hack>`
+      wanting to experiment with new IE algorithms.
+
+
+You can follow the development of this project and report issues at http://github.com/machinalis/iepy
+or join the mailing list `here <https://groups.google.com/forum/?hl=es-419#%21forum/iepy>`__
+
+Features
+--------
+
+    - :doc:`A corpus annotation tool <corpus_labeling>`
+      with a `web-based UI <corpus_labeling.html#document-based-labeling>`_
+    - :doc:`An active learning relation extraction tool <active_learning_tutorial>`
+      pre-configured with convenient defaults.
+    - :doc:`A rule based relation extraction tool <rules_tutorial>`
+      for cases where the documents are semi-structured or high precision is required.
+    - A web-based user interface that:
+        - Allows layman users to control some aspects of IEPY.
+        - Allows decentralization of human input.
+    - A shallow entity ontology with coreference resolution via `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_
+    - :doc:`An easily hack-able active learning core <how_to_hack>`,
+      ideal for scientists wanting to experiment with new algorithms.
+
+
+Contents:
+---------
+
+.. toctree::
+   :maxdepth: 2
+
+   installation
+   tutorial
+   instantiation
+   active_learning_tutorial
+   rules_tutorial
+   preprocess
+   gazettes
+   corpus_labeling
+   how_to_hack
+   troubleshooting
+   language
+
+
+Authors
+-------
+
+IEPY is © 2014 `Machinalis <http://www.machinalis.com/>`_ in collaboration
+with the `NLP Group at UNC-FaMAF <http://pln.famaf.unc.edu.ar/>`_. Its primary
+authors are:
+
+ * Rafael Carrascosa <rcarrascosa@machinalis.com> (rafacarrascosa at github)
+ * Javier Mansilla <jmansilla@machinalis.com> (jmansilla at github)
+ * Gonzalo García Berrotarán <ggarcia@machinalis.com> (j0hn at github)
+ * Franco M. Luque <francolq@famaf.unc.edu.ar> (francolq at github)
+ * Daniel Moisset <dmoisset@machinalis.com> (dmoisset at github)
+
+Changelog
+---------
+
+.. include:: Changelog

+ 79 - 0
docs/installation.rst

@@ -0,0 +1,79 @@
+==================
+IEPY installation
+==================
+
+IEPY runs on *python 3*, and it's fully tested with version *3.4*.
+These installation notes assume that you have a fresh installation of *Ubuntu 14.04*.
+If you are installing IEPY on a different platform, some details
+or software versions may be slightly different.
+
+Because of some of its dependencies, IEPY installation is not a single
+pip install, but it's actually not that hard.
+
+Outline:
+    - install some system packages
+    - install iepy itself
+    - download 3rd party binaries
+
+
+System software needed
+----------------------
+
+You need to install the following packages:
+
+.. code-block:: bash
+
+    sudo apt-get install build-essential python3-dev liblapack-dev libatlas-dev gfortran
+
+They are needed for python Numpy installation. Once this is done, install numpy by doing:
+
+.. code-block:: bash
+
+    pip install numpy
+
+
+And later, to be able to run some java processes:
+
+.. code-block:: bash
+
+    sudo apt-get install openjdk-7-jre
+
+.. note::
+
+    Instead of openjdk-7-jre you can use any other java (version 1.6 or higher) you
+    may have.
+
+    **Java 1.8** will allow you to use the **newest preprocess models**.
+
+
+Install IEPY package
+--------------------
+
+1. :doc:`Create a Virtualenv <virtualenv>`
+
+2. Install IEPY itself
+
+.. code-block:: bash
+
+    pip install iepy
+
+3. Configure java & NLTK
+
+    In order to preprocess documents, set the
+    environment variable JAVAHOME=/usr/bin/java (or the path where java was installed)
+    To make this configuration persistent, add it to your shell rc file.
+
+Download the third party data and tools
+---------------------------------------
+
+You should have now a command named "*iepy*". Use it like this to get some required
+binaries.
+
+.. code-block:: bash
+
+    iepy --download-third-party-data
+
+.. note::
+
+    If the java binary pointed by your JAVAHOME is 1.8, newest preprocess models will
+    be acquired and used.

+ 137 - 0
docs/instantiation.rst

@@ -0,0 +1,137 @@
+Instantiation
+=============
+
+Here, we'll explain in detail what an instantiation contains and what it does.
+
+Folder structure
+----------------
+
+The folder structure of an iepy instance is the following:
+
+.. code-block:: bash
+
+    ├── __init__.py
+    ├── settings.py
+    ├── database_name_you_picked.sqlite
+    ├── bin
+    │   ├── csv_to_iepy.py
+    │   ├── iepy_rules_runner.py
+    │   ├── iepy_runner.py
+    │   ├── manage.py
+    │   ├── preprocess.py
+    │   └── rules_verifier.py
+    ├── extractor_config.json
+    └── rules.py
+
+
+Let's see why each one of those files is there:
+
+
+Settings file
+.............
+
+settings.py is a configuration file where you can change the database settings and all the web interface related settings.
+This file has a `django settings <https://docs.djangoproject.com/en/1.7/ref/settings/>`_ file format.
+
+Database
+........
+
+When you create an instance, a *sqlite* database is created by default.
+It has no data yet, since you'll have to fill it with your own data.
+
+When working with big datasets, it's recommended to use some database engine other than *sqlite*.
+To change the database engine, change the `DATABASES` section of the settings file:
+
+::
+
+    DATABASES = {
+        'default': {
+            'ENGINE': 'django.db.backends.sqlite3',
+            'NAME': 'database_name_you_picked.sqlite',
+        }
+    }
+
+For example, you can use PostgreSQL like this:
+
+::
+
+    DATABASES = {
+        'default': {
+            'ENGINE': 'django.db.backends.postgresql_psycopg2',
+            'NAME': 'your_database_name',
+        }
+    }
+
+(Remember that you'll need to install ``psycopg2`` first with a simple ``pip install psycopg2``)
+
+Take a look at the `django database configuration documentation <https://docs.djangoproject.com/en/dev/ref/settings/#databases>`_ for more detail.
+
+.. note::
+
+    Each time you change your database (either the engine or the name) you will have
+    to instruct *django* to create all the tables in it, like this:
+
+    .. code-block:: bash
+
+        python bin/manage.py migrate
+
+
+Active learning configuration
+.............................
+
+``extractor_config.json`` specifies the configuration of the active learning core in *json* format.
+
+Rules definition
+................
+
+If you decide to use the rule based core, you'll have to define all your rules in the file ``rules.py``
+
+You can verify if your rules run correctly using ``bin/rules_verifier.py``.
+Read more about it `here <rules_tutorial.html#verifying-your-rules>`__.
+
+CSV importer
+............
+
+In the ``bin`` folder, you'll find a tool to import data from CSV files. This is the script ``csv_to_iepy.py``.
+Your CSV data has to be in the following format:
+
+::
+
+    <document_id>, <document_text>
+
+Preprocess
+..........
+
+To preprocess your data, you will use the  ``bin/preprocess.py`` script. Read more about it :doc:`here <preprocess>`
+
+Runners
+.......
+
+In the ``bin`` folder, you have scripts to run the active learning core (``iepy_runner.py``) or the
+rule based core (``iepy_rules_runner.py``)
+
+Web UI management
+.................
+
+For the web server management, you have the ``bin/manage.py`` script. This is a `django manage file <https://docs.djangoproject.com/en/1.7/ref/django-admin/>`_
+and with it you can start up your server.
+
+
+Instance Upgrade
+----------------
+
+From time to time, small changes in the iepy internals will require an *upgrade* of existing iepy instances.
+
+The upgrade process will apply the needed changes to the instance-folder structure.
+
+If you made local changes, the tool will preserve a copy of your changes so you can merge the conflicting areas by hand.
+
+To upgrade an iepy instance, simply run the following command
+
+    .. code-block:: bash
+
+        iepy --upgrade <instance path>
+
+.. note::
+
+    Look at the settings file to find the version of any iepy instance.

BIN
docs/label_by_document_entity_edition.png


BIN
docs/label_by_document_relation_labeled.png


BIN
docs/label_by_document_screenshot.png


BIN
docs/label_by_segment_screenshot.png


BIN
docs/labels_by_iepy.png


+ 60 - 0
docs/language.rst

@@ -0,0 +1,60 @@
+==================
+Language support
+==================
+
+By default IEPY will use English models, but it's also able to work with different
+languages.
+
+The preprocess machinery that's provided by default (Stanford Core NLP) has support
+for some other languages, so, check their models and documentation in case you need this.
+
+.. note::
+
+    The main goal until now was to architect IEPY to allow different languages.
+    Right now, the only fully supported languages are English, Spanish and German. If you need
+    something else, do not hesitate to contact us.
+
+
+Language Installation and Models
+--------------------------------
+
+The language models used by IEPY (the information used during preprocessing phase)
+are stored on your IEPY installation. Several models for different languages can be
+installed on the same installation.
+
+In order to download Spanish models you should run
+
+.. code-block:: bash
+
+    iepy --download-third-party-data --lang=es
+
+
+In order to download German models you should run
+
+.. code-block:: bash
+
+    iepy --download-third-party-data --lang=de
+
+
+.. note::
+
+    Check Stanford Core NLP documentation and files to download for more language packages.
+
+
+Language Definition and Instances
+---------------------------------
+
+Every IEPY instance works for a single language, which is declared on the settings.py file.
+
+To change the instance language, change the settings file on the section where it says `IEPY_LANG`:
+
+::
+
+    IEPY_LANG = 'en'
+
+
+To create an IEPY instance for a different language, you should run
+
+.. code-block:: bash
+
+    iepy --create --lang=es <folder_path>

+ 231 - 0
docs/preprocess.rst

@@ -0,0 +1,231 @@
+About the Pre-Process
+=====================
+
+The preprocessing adds the metadata that iepy needs to detect the relations, which includes:
+
+    * Text tokenization and sentence splitting.
+    * Text lemmatization
+    * Part-Of-Speech (POS) tagging.
+    * Named Entity Recognition (NER).
+    * Gazettes resolution
+    * Syntactic parsing.
+    * TextSegments creation (internal IEPY text unit).
+
+We're currently running all these steps (except the last one) using the `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_ tools.
+This runs as a single all-in-one pass, but every step can be :ref:`modified to use a custom version <customize>` to adjust to your needs.
+
+
+About the Tokenization and Sentence splitting
+---------------------------------------------
+
+The text of each Document is split on tokens and sentences, and that information is stored
+on the document itself, preserving (and also storing) for each token the offset (in chars)
+to the original document text.
+
+The one used by default it's the one that the `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_ provides.
+
+.. note::
+
+    While using the Stanford tokenizer, you can customize some of tokenization options.
+
+    First read here: `tokenizer options <http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html>`_
+
+    On your instance *settings.py* file, add options as keys on the CORENLP_TKN_OPTS dict.
+    You can use as key any of the "known options", and as value,
+    use True or False for booleans, or just strings when option requires a text.
+    Example:
+
+    .. code-block:: python
+
+        CORENLP_TKN_OPTS = {
+            'latexQuotes': False
+        }
+
+
+Lemmatization
+-------------
+
+.. note::
+
+    Lemmatization was added on the version 0.9.2, all instances that were created before that,
+    need to run the preprocess script again. This will run only the lemmatization step.
+
+The text runs through a step of lemmatization where each token gets a lemma. This is a canonical form of the word that
+can be used in the classifier features or the rules core.
+
+
+Part of speech tagging (POS)
+----------------------------
+
+Each token is augmented with metadata about its part of speech such as noun, verb,
+adjective and other grammatical tags.
+Along with the token itself, this may be used by the NER to detect an entity occurrence.
+This information is also stored on the Document itself, together with the tokens.
+
+The one used by default it's the one that the `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_ provides.
+
+Named Entity Recognition (NER)
+------------------------------
+
+To find a relation between entities one must first recognize these entities in the text.
+
+As a result of NER, each document is augmented with information about all the found
+Named Entities (together with which tokens are involved in each occurrence).
+
+An automatic NER is used to find occurrences of an entity in the text.
+
+The default pre-process uses the Stanford NER, check the Stanford CoreNLP's `documentation <http://nlp.stanford.edu/software/corenlp.shtml>`_
+to find out which entity kinds are supported, but includes:
+
+    * Location
+    * Person
+    * Organization
+    * Date
+    * Number
+    * Time
+    * Money
+    * Percent
+
+Others remarkable features of this NER (that are incorporated to the default pre-process) are:
+
+    - pronoun resolution
+    - simple co-reference resolution
+
+This step can be customized to find entities of kinds defined by you, or anything else you may need.
+
+Gazettes resolution
+-------------------
+
+In case you want to add named entity recognition by matching literals, iepy provides a system of gazettes.
+This is a mapping of literals and entity kinds that will be run on top of the basic stanford NER.
+With this, you'll be able to recognize entities beyond the ones detected by the stanford NER, or even correct
+those that are incorrectly tagged.
+
+:doc:`Learn more about here. <gazettes>`
+
+
+Syntactic parsing
+-----------------
+
+.. note::
+
+    Syntactic parsing was added on the version 0.9.3, all instances that were created before that,
+    need to run the preprocess script again. This will run only the syntactic parsing step.
+
+The sentences are parsed to work out the syntactic structure. Each sentence gets a structure tree
+that is stored in `Penn Treebank notation <http://en.wikipedia.org/wiki/Treebank>`__. IEPY presents
+this to the user using a `NLTK Tree object <http://www.nltk.org/howto/tree.html>`__.
+
+By default the sentences are processed with the `Stanford Parser <http://nlp.stanford.edu/software/lex-parser.shtml>`__
+provided within the `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`__.
+
+For example, the syntactic parsing of the sentence ``Join the dark side, we have cookies`` would be:
+
+::
+
+    (ROOT
+      (S
+        (S
+          (VP (VBN Join)
+            (NP (DT the) (JJ dark) (NN side))))
+        (, ,)
+        (NP (PRP we))
+        (VP (VBP have)
+          (NP (NNS cookies)))))
+
+About the Text Segmentation
+---------------------------
+
+IEPY works on a **text segment** (or simply **segment**) level, meaning that will
+try to find if a relation is present within a segment of text. The
+pre-process is the responsible for splitting the documents into segments.
+
+The default pre-process uses a segmenter that creates segments for the documents with the following criteria:
+
+ * for each sentence on the document, if there are at least 2 Entity Occurrences in there
+
+
+.. _customize:
+
+How to customize
+----------------
+
+On your own IEPY instances, there's a file called ``preprocess.py`` located in the ``bin`` folder.
+There you'll find that the default is to simply run the Stanford preprocess, and later the segmenter.
+This can be changed to run a sequence of steps defined by you.
+
+For example, take this pseudo-code to guide you:
+
+.. code-block:: python
+
+    pipeline = PreProcessPipeline([
+        CustomTokenizer(),
+        CustomSentencer(),
+        CustomLemmatizer(),
+        CustomPOSTagger(),
+        CustomNER(),
+        CustomSegmenter(),
+    ], docs)
+    pipeline.process_everything()
+
+
+.. note::
+
+    The steps can be functions or callable objects. We recommend objects because generally you'll
+    want to do some load up of things on the `__init__` method to avoid loading everything over and over again.
+
+Each one of those steps will be called with each one of the documents, meaning that every step will be called
+with all the documents; after that finishes, the next step will be called with each one of the documents.
+
+
+Running in multiple cores
+-------------------------
+
+Preprocessing might take a lot of time. To handle this you can run the preprocessing on several cores of the
+same machine or even run it on different machines to accelerate the processing.
+
+To run it on the same machine using multiple cores, all you need to do is run:
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --multiple-cores=all
+
+This will use all the available cores. You can also specify a number if you want to
+use less than that, like this:
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --multiple-cores=2
+
+Running in multiple machines
+----------------------------
+
+Running the preprocess on different machines is a bit tricky; here's what you'll need:
+
+    * A iepy instance with a database that allows remote access (such as postgres)
+    * One iepy instance on each extra machine that has the database setting pointing to the main one.
+
+Then you'll need to decide on how many parts do you want to split the document set
+and run each part on a different machine. For example, you could split the documents in 4 and run 2 processes
+on one machine and 2 on another one. To do this you'll run:
+
+
+On one of the machines, in two different consoles run:
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --split-in=4 --run-part=1
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --split-in=4 --run-part=2
+
+And on the other machine:
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --split-in=4 --run-part=3
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --split-in=4 --run-part=4

+ 188 - 0
docs/rules_tutorial.rst

@@ -0,0 +1,188 @@
+Running the rule based core
+===========================
+
+Here we will guide you through the steps to use the rule based system
+to detect relations on the documents.
+
+
+How they work
+-------------
+
+In the rule based system, you have to define a set of "regular expression like" rules
+that will be tested against the segments of the documents. Roughly speaking,
+if a rule matches it means that the relation is present.
+
+This is used to acquire high precision because you control exactly what is matched.
+
+
+Anatomy of a rule
+-----------------
+
+.. note::
+    If you don't know how to define a python function,
+    `check this out <https://docs.python.org/3/tutorial/controlflow.html#defining-functions>`_
+
+
+A rule is basically a *decorated python function*.
+We will see where this needs to be added later, for now lets concentrate on how it is written.
+
+.. code-block:: python
+
+    @rule(True)
+    def born_date_and_death_in_parenthesis(Subject, Object):
+        """ Example: Carl Bridgewater (January 2, 1965 - September 19, 1978) was shot dead """
+        anything = Star(Any())
+        return Subject + Pos("-LRB-") + Object + Token("-") + anything + Pos("-RRB-") + anything
+
+First you have to specify that your function is in fact a rule by using the **decorator @rule**.
+
+As you can see in the first line, this is added on top of the function.
+In this decorator you have to define if the rule is going to be *positive* or *negative*. A positive
+rule that matches will label the relations as present and a negative one will label it as not present.
+You can define this by passing the True or False parameter to the rule decorator.
+
+Then comes the definition of the function. This function takes two parameters: the **Subject** and the **Object**.
+These are patterns that will be part of the regex that the function has to return.
+
+After that comes the body of the function. Here the regular expression has to be constructed and
+returned by the function.  This is not an ordinary regular expression, it
+uses `ReFO <https://github.com/machinalis/refo>`_.
+In ReFO you have to operate with objects that do some kind of check on the text segment.
+
+For our example, we've chosen to look for the *Was Born* relation. Particularly we look for the date of birth of a
+person when it is written like this:
+
+::
+
+    Carl Bridgewater (January 2, 1965 - September 19, 1978)
+
+To match this kind of cases, we have to specify the regex as a sum of predicates. This will check if every
+part matches.
+
+Rule's building blocks
+----------------------
+
+Aside from the ReFO predicates, iepy comes with a bunch of predicates that you will find useful for creating your own rules
+
+    * **Subject**: matches the evidence's left part.
+    * **Object**: matches the evidence's right part.
+    * **Token**: matches if the token is literally the one specified.
+    * **Lemma**: matches if the lemma is literally the one specified.
+    * **Pos**: matches the *part of speech* of the token examined.
+    * **Kind**: matches if the token belongs to an entity occurrence with a given kind.
+
+
+Setting priority
+----------------
+
+Using the **rule decorator**, you can set that a rule is more important than another, and because of that it will
+be tried first.
+
+IEPY will run the rules ordered decreasingly by their priority number, and the default priority is 0.
+
+For example, to set a priority of 1 you do:
+
+.. code-block:: python
+
+    @rule(True, priority=1)
+    def rule_name(Subject, Object):
+        ...
+
+
+Negative rules
+--------------
+
+If you spot that your rules are matching things erroneously, you can write a rule
+that catches that before it is taken by a positive rule.
+
+You do this by setting the rule as a *negative rule* using the decorator. It is also
+recommended to set a higher priority so it is checked before the other ones.
+
+Example:
+
+
+.. code-block:: python
+
+    @rule(False, priority=1)
+    def incorrect_labeling_of_place_as_person(Subject, Object):
+        """
+        Ex:  Sophie Christiane of Wolfstein (24 October 24, 1667 - 23 August 1737)
+
+        Wolfstein is a *place*, not a *person*
+        """
+        anything = Star(Any())
+        person = Plus(Pos("NNP") + Question(Token(",")))
+        return anything + person + Token("of") + Subject + anything
+
+
+Note that the parameters of the rule decorator are **False** and **priority=1**
+
+Where do I place the rules
+--------------------------
+
+On your project's instance folder, there should be a *rules.py* file. All rules should be placed
+there along with a **RELATION** variable that sets which relation is going to be used.
+
+This is the file that will be loaded when you run the *iepy_rules_runner*.
+
+
+Example
+-------
+
+This is a portion of the example provided with IEPY, you can view the `complete
+file here <https://github.com/machinalis/iepy/blob/develop/examples/birthdate/was_born_rules_sample.py>`__.
+
+.. code-block:: python
+
+    from refo import Question, Star, Any, Plus
+    from iepy.extraction.rules import rule, Token, Pos
+
+    RELATION = "was born"
+
+    @rule(True)
+    def was_born_explicit_mention(Subject, Object):
+        """
+        Ex: Shamsher M. Chowdhury was born in 1950.
+        """
+        anything = Star(Any())
+        return anything + Subject + Token("was born") + Pos("IN") + Object + anything
+
+
+    @rule(True)
+    def is_born_in(Subject, Object):
+        """
+        Ex: Xu is born in 1902 or 1903 in a family of farmers in Hubei ..
+        """
+        anything = Star(Any())
+        return Subject + Token("is born in") + Object + anything
+
+
+    @rule(True)
+    def just_born(Subject, Object):
+        """
+        Ex: Lyle Eugene Hollister, born 6 July 1923 in Sioux Falls, South Dakota, enlisted in the Navy....
+        """
+        anything = Star(Any())
+        return Subject + Token(", born") + Object + anything
+
+
+Verifying your rules
+--------------------
+
+During the construction of your rules, you might want to check whether the rules are matching or not.
+Even more, if you have tagged data in your corpus, you can know how good the performance is.
+
+The rules verifier is located on your instance under the ``bin`` directory, it's called ``rules_verifier.py``
+
+You can run the verifier with every rule or with a single rule, on all of the segments or in a sample of those.
+Take a look at the parameters on the rules verifier to find out how to use them by running:
+
+.. code-block:: bash
+
+    $ python bin/rules_verifier.py --help
+
+If you have labeled data on your corpus, the run will calculate how it scored in terms of precision, recall and
+other metrics. You have to keep in mind that this is not exactly what you'll get when you run the rules core, even
+if you run the verifier with all the rules and all the data, the numbers are going to be a little different because
+this will run every evidence with every rule, and the core instead stops at the first match. This is just a warning so you
+don't get too excited or too depressed with these results.

+ 2 - 0
docs/setup/requirements-base.txt

@@ -0,0 +1,2 @@
+# because of https://github.com/machinalis/iepy/issues/63
+-e .

+ 3 - 0
docs/setup/requirements-development.txt

@@ -0,0 +1,3 @@
+-r requirements-base.txt
+Sphinx==1.2.2
+pygal==1.4.6

+ 0 - 0
docs/setup/system_packages.txt


+ 4 - 0
docs/setup/third_party.txt

@@ -0,0 +1,4 @@
+There is a script to download 3rd party data in scripts/download_third_party_data.py
+Currently it downloads:
+    - The stanford POS and NES tagger
+    - punktokenizer

+ 38 - 0
docs/troubleshooting.rst

@@ -0,0 +1,38 @@
+==================
+Troubleshooting
+==================
+
+
+32 bit architecture issues
+--------------------------
+
+We've experienced some memory issues when using a computer with 32 bit architecture. This is because by default we use the
+Stanford CoreNLP (java based), which has some special needs about the memory. Read about them more in detail `here <http://nlp.stanford.edu/software/tagger.shtml>`__
+
+We quote:
+
+    The system requires Java 1.8+ to be installed. Depending on whether you're running 32 or 64 bit Java and the complexity of the tagger model, you'll need somewhere between 60 and 200 MB of memory to run a trained tagger (i.e., you may need to give java an option like java -mx200m)
+
+What has worked for us is adding the following environment variable before running iepy:
+
+.. code-block:: bash
+
+    export _JAVA_OPTIONS='-Xms1024M -Xmx1024m'
+
+You can modify those numbers to your convenience.
+
+
+Preprocess not running under MacOS
+----------------------------------
+
+    Problems with the preprocess under MacOS? Apparently a change in the CoreNLP script is needed to
+    be run. You need to change the file ``corenlp.sh`` that is located on
+    ``/Users/<your user>/Library/Application Support/iepy/stanford-corenlp-full-2014-08-27/``
+    and change ``scriptdir=`dirname $0``` for ``scriptdir=`dirname "$0"``` (ie, add double quotes around ``$0``)
+
+
+Can't install IEPY with python 2
+--------------------------------
+
+  Indeed, IEPY works with Python 3.4 or higher.
+

+ 110 - 0
docs/tutorial.rst

@@ -0,0 +1,110 @@
+From 0 to IEPY
+==============
+
+In this tutorial we will guide you through the steps to create your first
+Information Extraction application with IEPY.
+Be sure you have a working :doc:`installation <installation>`.
+
+IEPY internally uses `Django <https://www.djangoproject.com/>`_ to define the database models,
+and to provide a web interface. You'll see some components of Django around the project, such as the
+configuration file (with the database definition) and the ``manage.py`` utility. If you're familiar
+with Django, you will move faster in some of the steps.
+
+
+0 - Creating an instance of IEPY
+--------------------------------
+
+To work with IEPY, you'll have to create an *instance*.
+This is going to be where the configuration, database and some binary files are stored.
+To create a new instance you have to run:
+
+.. code-block:: bash
+
+    iepy --create <project_name>
+
+Where *<project_name>* is something that you choose.
+This command will ask you a few things such as database name, its username and its password.
+When that's done, you'll have an instance in a folder with the name that you chose.
+
+Read more about the instantiation process :doc:`here <instantiation>`.
+
+
+1 - Loading the database
+------------------------
+
+The way we load the data into the database is importing it from a *csv* file. You can use the script **csv_to_iepy**
+provided in your application folder to do it.
+
+
+.. code-block:: bash
+
+    python bin/csv_to_iepy.py data.csv
+
+This will load **data.csv** into the database, from which the data will subsequently be accessed.
+
+Learn more about the required CSV file format `here <instantiation.html#csv-importer>`_.
+
+
+.. note::
+
+    You might also provide a *gziped csv file.*
+
+
+2 - Pre-processing the data
+---------------------------
+
+Once you have your database with the documents you want to analyze, you have to
+run them through the pre-processing pipeline to generate all the information needed by IEPY's core.
+
+The pre-processing pipeline runs a series of steps such as 
+text tokenization, sentence splitting, lemmatization, part-of-speech tagging,
+and named entity recognition
+
+:doc:`Read more about the pre-processing pipeline here. <preprocess>`
+
+Your IEPY application comes with code to run all the pre-processing steps.
+You can run it by doing:
+
+.. code-block:: bash
+
+    python bin/preprocess.py
+
+This *will* take a while, especially if you have a lot of data.
+
+
+
+3 - Open the web interface
+--------------------------
+
+To help you control IEPY, you have a web user interface.
+Here you can manage your database objects and label the information
+that the active learning core will need.
+
+To access the web UI, you must run the web server. Don't worry, you have everything
+that you need on your instance folder and it's as simple as running:
+
+.. code-block:: bash
+
+    python bin/manage.py runserver
+
+Leave that process running, and open up a browser at `http://127.0.0.1:8000 <http://127.0.0.1:8000>`_ to view
+the user interface home page.
+
+Now it's time for you to *create a relation definition*. Use the web interface to create the relation that you
+are going to be using.
+
+IEPY
+----
+
+Now, you're ready to run either the :doc:`active learning core <active_learning_tutorial>`
+or the :doc:`rule based core <rules_tutorial>`.
+
+
+Constructing a reference corpus
+-------------------------------
+
+To test information extraction performance, IEPY provides a tool for labeling the entire corpus "by hand"
+and then check the performance experimenting with that data.
+
+If you would like to create a labeled corpus to test the performance or for other purposes, take a look at
+the :doc:`corpus labeling tool <corpus_labeling>`

+ 19 - 0
docs/virtualenv.rst

@@ -0,0 +1,19 @@
+Virtualenv creation
+-------------------
+
+For organization's sake, it's strongly recommended to do the whole IEPY
+installation inside a virtual python environment.
+
+We shouldn't be explaining how to create it here, so we won't.
+There is way better documentation
+`here <https://docs.python.org/3.4/library/venv.html>`__
+for python 3.4.
+
+Just make sure to have it created and activated while following the
+IEPY installation instructions.
+Some small notes before leading you to the good documentation:
+
+ - If you are working with python3.3 (or 3.4 but with the buggy ubuntu/debian release),
+   be warned that you will need to install *pip* by hand,
+   as explained `here <http://pip.readthedocs.org/en/latest/installing.html#install-pip>`__
+ - Alternatively, create your virtualenv with `virtualenvwrapper <http://virtualenvwrapper.readthedocs.org/en/latest/install.html#basic-installation>`_

+ 7 - 0
examples/birthdate/scripts/create_birthdate_relation.py

@@ -0,0 +1,7 @@
+from iepy.data.models import Relation, EntityKind
+
+
+if __name__ == "__main__":
+    person = EntityKind.objects.get_or_create(name="PERSON")[0]
+    date = EntityKind.objects.get_or_create(name="DATE")[0]
+    Relation.objects.get_or_create(name="BIRTHDATE", left_entity_kind=person, right_entity_kind=date)

+ 47 - 0
examples/birthdate/scripts/csv_to_iepy.py

@@ -0,0 +1,47 @@
+"""
+Birthdate corpus preprocessing script.
+
+Usage:
+    csv_to_iepy.py <filename>
+    csv_to_iepy.py -h | --help
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+corpus in two columns: 'freebase_mid' and 'description'.
+
+Options:
+  -h --help             Show this screen
+"""
+import logging
+import csv
+import gzip
+import os
+
+from docopt import docopt
+
+from iepy.data.db import DocumentManager
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO,
+                        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+    opts = docopt(__doc__, version=0.1)
+
+    name = opts["<filename>"]
+    if name.endswith(".gz"):
+        fin = gzip.open(name, "rt")
+    else:
+        fin = open(name, "rt")
+    reader = csv.DictReader(fin)
+    name = os.path.basename(name)
+
+    docdb = DocumentManager()
+
+    seen = set()
+    for i, d in enumerate(reader):
+        mid = d["freebase_mid"]
+        if mid in seen:
+            continue
+        seen.add(mid)
+        docdb.create_document(identifier=mid,
+                              text=d["description"],
+                              metadata={"input_filename": name})

+ 34 - 0
examples/birthdate/scripts/preprocess.py

@@ -0,0 +1,34 @@
+"""
+Birthdate corpus preprocessing script
+
+Usage:
+    preprocess.py
+    preprocess.py -h | --help | --version
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+import logging
+
+from docopt import docopt
+
+from iepy.data.db import DocumentManager
+from iepy.preprocess.stanford_preprocess import StanfordPreprocess
+from iepy.preprocess.pipeline import PreProcessPipeline
+from iepy.preprocess.segmenter import SyntacticSegmenterRunner
+
+
+if __name__ == '__main__':
+    logger = logging.getLogger(u'preprocess')
+    logger.setLevel(logging.INFO)
+    logging.basicConfig(level=logging.INFO,
+                        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+    opts = docopt(__doc__, version=0.1)
+    docs = DocumentManager()
+    pipeline = PreProcessPipeline([
+        StanfordPreprocess(),
+        SyntacticSegmenterRunner(increment=True)
+    ], docs
+    )
+    pipeline.process_everything()

+ 42 - 0
examples/birthdate/settings.py

@@ -0,0 +1,42 @@
+"""
+For more information on this file, see
+https://docs.djangoproject.com/en/1.7/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.7/ref/settings/
+"""
+
+from iepy.webui.webui.settings import *
+
+IEPY_VERSION = '0.9.6'
+IEPY_LANG = 'en'
+SECRET_KEY = 'u==!fueit=wxo&j8!5u+sfasp4prjluk@*s=7!-wz_&r@pn))r'
+DEBUG = True
+TEMPLATE_DEBUG = True
+
+# Database
+# https://docs.djangoproject.com/en/1.7/ref/settings/#databases
+# DATABASES = {
+#     'default': {
+#         'ENGINE': 'django.db.backends.sqlite3',
+#         'NAME': '/home/python/luojiehua/dl_nlp/iepy-develop/examples/test/test.sqlite',
+#     }
+# }
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.postgresql_psycopg2',
+        'NAME': 'iepy',
+        'USER': 'postgres',
+        'PASSWORD': 'postgres',
+        'HOST': 'localhost',
+        'PORT': '5432'
+    }
+}
+
+# For changing tokenization options, read here.
+# http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html
+# You can use as key any of the "known options" listed on that page, and as value,
+# use True or False (python names) for booleans, or strings when option requires a text
+# CORENLP_TKN_OPTS = {
+#     'latexQuotes': False
+# }

+ 122 - 0
examples/birthdate/was_born_rules_sample.py

@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+
+from refo import Question, Star, Any, Plus
+from iepy.extraction.rules import rule, Token, Pos
+
+
+RELATION = "was born"
+
+
+@rule(True)
+def born_date_in_parenthesis(Subject, Object):
+    """
+    Ex: Gary Sykes (Born 13 February 1984) is a British super featherweight boxer.
+    """
+    anything = Star(Any())
+    born = Star(Pos(":")) + Question(Token("Born") | Token("born")) + Question(Token("c."))
+    entity_leftover = Star(Pos("NNP"))
+    return Subject + entity_leftover + Pos("-LRB-") + born + Object + Pos("-RRB-") + anything
+
+
+@rule(True)
+def born_two_dates_in_parenthesis(Subject, Object):
+    """
+    Ex: James Cunningham (born 1973 or 1974) is a Canadian stand-up comedian and TV host.
+    """
+    anything = Star(Any())
+    born = Question(Token("Born") | Token("born"))
+    entity_leftover = Star(Pos("NNP"))
+    subject = Subject + entity_leftover
+    or_object = (Object + Token("or") + Pos("CD") |
+                 Pos("CD") + Token("or") + Object)
+    return subject + Pos("-LRB-") + born + or_object + Pos("-RRB-") + anything
+
+
+@rule(True)
+def born_date_and_death_in_parenthesis(Subject, Object):
+    """
+    Ex: Carl Bridgewater (January 2, 1965 - September 19, 1978) was shot dead
+    """
+    anything = Star(Any())
+    return Subject + Pos("-LRB-") + Object + Token("-") + anything + Pos("-RRB-") + anything
+
+
+@rule(True)
+def born_date_and_place_in_parenthesis(Subject, Object):
+    """
+    Ex: Gary Sykes (Born 13 February 1984) is a British super featherweight boxer.
+    """
+    anything = Star(Any())
+    born = (Token("Born") | Token("born"))
+    entity_leftover = Star(Pos("NNP"))
+    place = Plus(Pos("NNP") + Question(Token(",")))
+    return Subject + entity_leftover + Pos("-LRB-") + born + Object + Token(",") + place + Pos("-RRB-") + anything
+
+
+@rule(True)
+def was_born_explicit_mention(Subject, Object):
+    """
+    Ex: Shamsher M. Chowdhury was born in 1950.
+    """
+    anything = Star(Any())
+    return anything + Subject + Token("was born") + Pos("IN") + Object + anything
+
+
+@rule(True)
+def is_born_in(Subject, Object):
+    """
+    Ex: Xu is born in 1902 or 1903 in a family of farmers in Hubei (China RRB)
+    """
+    anything = Star(Any())
+    return Subject + Token("is born in") + Object + anything
+
+
+@rule(True)
+def mentions_real_name(Subject, Object):
+    """
+    Ex: Harry Pilling, born Ashtonunder-Lyne, Lancashire on 2 February 1943, played ...
+    """
+    anything = Star(Any())
+    real_name = Plus(Pos("NNP") + Question(Token(",")))
+    return Subject + Token("born") + real_name + Pos("IN") + Object + anything
+
+
+@rule(True)
+def was_born_and_mentions_place(Subject, Object):
+    """
+    Ex: Nasser Sharify was born in Tehran, Iran, in 1925.
+    """
+    place = Plus(Pos("NNP") + Question(Token(",")))
+    return Subject + Token("was born") + Pos("IN") + place + Pos("IN") + Object + Question(Pos("."))
+
+
+@rule(True)
+def was_born_and_mentions_place_2(Subject, Object):
+    """
+    Ex: Theodone C. Hu was born in 1872 in Huangpu town, Haizhu District, Guangzhou, Guangdong, China.
+    """
+    anything = Star(Any())
+    place = Plus(Pos("NNP") + Question(Token(",")))
+    return Subject + Token("was born") + Pos("IN") + Object + Pos("IN") + place + anything
+
+
+@rule(True)
+def just_born(Subject, Object):
+    """
+    Ex: Lyle Eugene Hollister, born 6 July 1923 in Sioux Falls, South Dakota, enlisted in the Navy....
+    """
+    anything = Star(Any())
+    return Subject + Token(", born") + Object + anything
+
+
+## NEGATIVE RULES ##
+
+@rule(False, priority=1)
+def incorrect_labeling_of_place_as_person(Subject, Object):
+    """
+    Ex:  Sophie Christiane of Wolfstein (24 October 24, 1667 - 23 August 1737)
+    Wolfstein is a *place*, not a *person*
+    """
+    anything = Star(Any())
+    person = Plus(Pos("NNP") + Question(Token(",")))
+    return anything + person + Token("of") + Subject + anything

+ 1 - 0
examples/credit/__init__.py

@@ -0,0 +1 @@
+from . import rules

+ 49 - 0
examples/credit/annotation.conf

@@ -0,0 +1,49 @@
+# -*- Mode: Text; tab-width: 8; indent-tabs-mode: nil; coding: utf-8; -*-
+# vim:set ft=conf ts=2 sw=2 sts=2 autoindent:
+
+# Simple text-based definitions of entity, relation and event types
+# and event attributes for the BioNLP Shared Task 2011 EPI task.
+
+
+[entities]
+
+Protein
+	abc
+Entity
+
+
+[relations]
+
+Equiv	Arg1:Protein, Arg2:Protein, <REL-TYPE>:symmetric-transitive
+Equiv	Arg1:Entity, Arg2:Entity, <REL-TYPE>:symmetric-transitive
+
+# (No entity nestings permitted in EPI. Could be defined using special
+# relation type ENTITY-NESTING if necessary.)
+
+
+[events]
+
+Catalysis	Theme:<EVENT>, Cause:Protein
+----------------------------------------
+DNA_methylation|GO:0006306	Theme:Protein, Site?:Entity
+DNA_demethylation|GO:0080111	Theme:Protein, Site?:Entity
+----------------------------------------
+Acetylation|GO:0006473	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Methylation|GO:0006479	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Glycosylation|GO:0006486	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Hydroxylation|GO:0018126	Theme:Protein, Site?:Entity
+Phosphorylation|GO:0006468	Theme:Protein, Site?:Entity
+Ubiquitination|GO:0016567	Theme:Protein, Site?:Entity
+----------------------------------------
+Deacetylation|GO:0006476	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Demethylation|GO:0006482	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Deglycosylation|GO:0006517	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Dehydroxylation|GO:-------	Theme:Protein, Site?:Entity
+Dephosphorylation|GO:0006470	Theme:Protein, Site?:Entity
+Deubiquitination|GO:0016579	Theme:Protein, Site?:Entity
+
+
+[attributes]
+
+Negation	Arg:<EVENT>
+Speculation	Arg:<EVENT>

Fichier diff supprimé car celui-ci est trop grand
+ 7 - 0
examples/credit/articles.csv


+ 28 - 0
examples/credit/bin/csv_to_iepy.py

@@ -0,0 +1,28 @@
+"""
+IEPY database loader from csv file
+
+Usage:
+    csv_to_iepy.py <filename>
+    csv_to_iepy.py -h | --help
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+corpus in two columns: 'freebase_mid' and 'description'.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+
+import logging
+
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.utils import csv_to_iepy
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    opts = docopt(__doc__, version=iepy.__version__)
+    filepath = opts["<filename>"]
+    csv_to_iepy(filepath)

+ 76 - 0
examples/credit/bin/gazettes_loader.py

@@ -0,0 +1,76 @@
+"""
+IEPY gazettes loader
+
+Usage:
+    gazettes_loader.py <filename>
+
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+gazettes in two columns: 'literal' and 'class'.
+
+
+Options:
+  -h --help             Show this screen
+"""
+
+import sys
+import csv
+import gzip
+import logging
+from operator import itemgetter
+
+from django.db import IntegrityError
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.data.models import EntityKind, GazetteItem
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def add_gazettes_from_csv(filepath):
+    if filepath.endswith(".gz"):
+        fin = gzip.open(filepath, "rt")
+    else:
+        fin = open(filepath, "rt")
+    reader = csv.DictReader(fin)
+
+    expected_fnames = ['literal', 'class']
+    if not set(reader.fieldnames).issuperset(expected_fnames):
+        msg = "Couldn't find the expected field names on the provided csv: {}"
+        sys.exit(msg.format(expected_fnames))
+
+    _create_gazette_entries(
+        itemgetter(*expected_fnames)(line) for line in reader
+    )
+
+
+def _create_gazette_entries(entries_list):
+    kind_cache = {}
+    created = 0
+    for literal, kind_name in entries_list:
+        literal = literal.strip()
+        kind_name = kind_name.strip()
+        kind = kind_cache.get(kind_name)
+        if kind is None:
+            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
+            kind_cache[kind_name] = kind
+        gazette = GazetteItem(text=literal, kind=kind)
+
+        try:
+            gazette.save()
+        except IntegrityError as error:
+            logging.warn(
+                "Gazette '{}' of class '{}' not loaded, literal already existed".format(
+                literal, kind_name))
+            print(error)
+        finally:
+            created += 1
+    print('Created {} new gazette items'.format(created))
+
+
+if __name__ == "__main__":
+    opts = docopt(__doc__, version=iepy.__version__)
+    fname = opts["<filename>"]
+    add_gazettes_from_csv(fname)

+ 59 - 0
examples/credit/bin/iepy_rules_runner.py

@@ -0,0 +1,59 @@
+"""
+Run IEPY rule-based extractor
+
+Usage:
+    iepy_rules_runner.py
+    iepy_rules_runner.py -h | --help | --version
+
+Picks from rules.py the relation to work with, and the rules definitions and
+proceeds with the extraction.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+import sys
+import logging
+
+from django.core.exceptions import ObjectDoesNotExist
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.extraction.rules import load_rules
+from iepy.extraction.rules_core import RuleBasedCore
+from iepy.data import models, output
+from iepy.data.db import CandidateEvidenceManager
+
+
+def run_from_command_line():
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+    try:
+        relation_name = iepy.instance.rules.RELATION
+    except AttributeError:
+        logging.error("RELATION not defined in rules file")
+        sys.exit(1)
+
+    try:
+        relation = models.Relation.objects.get(name=relation_name)
+    except ObjectDoesNotExist:
+        logging.error("Relation {!r} not found".format(relation_name))
+        sys.exit(1)
+
+    # Load rules
+    rules = load_rules()
+
+    # Load evidences
+    evidences = CandidateEvidenceManager.candidates_for_relation(relation)
+
+    # Run the pipeline
+    iextractor = RuleBasedCore(relation, rules)
+    iextractor.start()
+    iextractor.process()
+    predictions = iextractor.predict(evidences)
+    output.dump_output_loop(predictions)
+
+
+if __name__ == u'__main__':
+    run_from_command_line()

+ 184 - 0
examples/credit/bin/iepy_runner.py

@@ -0,0 +1,184 @@
+"""
+Run IEPY active-learning extractor
+
+Usage:
+    iepy_runner.py [options] <relation_name> <output>
+    iepy_runner.py [options] --db-store <relation_name>
+    iepy_runner.py -h | --help | --version
+
+Options:
+  --store-extractor=<extractor_output>     Stores the trained classifier
+  --trained-extractor=<extractor_path>     Load an already trained extractor
+  --db-store                               Stores the predictions on the database
+  --no-questions                           Won't generate questions to answer. Will predict
+                                           as is. Should be used with --trained-extractor
+  --tune-for=<tune-for>                    Predictions tuning. Options are high-prec
+                                           or high-recall [default: high-prec]
+  --extractor-config=<config.json>         Sets the extractor config
+  --version                                Version number
+  -h --help                                Show this screen
+"""
+
+import os
+import json
+import logging
+from docopt import docopt
+from sys import exit
+
+import iepy
+INSTANCE_PATH = iepy.setup(__file__)
+
+from iepy.extraction.active_learning_core import ActiveLearningCore, HIPREC, HIREC
+from iepy.data.db import CandidateEvidenceManager
+from iepy.data.models import Relation
+from iepy.extraction.terminal import TerminalAdministration
+from iepy.data import output
+
+
+def print_all_relations():
+    print("All available relations:")
+    for relation in Relation.objects.all():
+        print("  {}".format(relation))
+
+
+def load_labeled_evidences(relation, evidences):
+    CEM = CandidateEvidenceManager  # shorcut
+    return CEM.labels_for(relation, evidences, CEM.conflict_resolution_newest_wins)
+
+
+def _get_tuning_mode(opts):
+    if opts['--tune-for'] == 'high-prec':
+        tuning_mode = HIPREC
+    elif opts['--tune-for'] == 'high-recall':
+        tuning_mode = HIREC
+    else:
+        print ('Invalid tuning mode')
+        print (__doc__)
+        exit(1)
+    return tuning_mode
+
+
+def _get_relation(opts):
+    relation_name = opts['<relation_name>']
+    try:
+        relation = Relation.objects.get(name=relation_name)
+    except Relation.DoesNotExist:
+        print("Relation {!r} non existent".format(relation_name))
+        print_all_relations()
+        exit(1)
+    return relation
+
+
+def _load_extractor(opts, relation, labeled_evidences):
+    extractor_path = opts.get('--trained-extractor')
+    try:
+        iextractor = ActiveLearningCore.load(extractor_path,
+                                             labeled_evidences=labeled_evidences)
+    except ValueError:
+        print("Error: unable to load extractor, invalid file")
+        exit(1)
+
+    if iextractor.relation != relation:
+        print('The loaded extractor is not for the requested relation'
+              ' but for relation {} instead'.format(iextractor.relation))
+        exit(1)
+    print('Extractor successfully loaded')
+    return iextractor
+
+
+def _construct_extractor(opts, relation, labeled_evidences, tuning_mode):
+    config_filepath = opts.get("--extractor-config")
+    if not config_filepath:
+        config_filepath = os.path.join(INSTANCE_PATH, "extractor_config.json")
+
+    if not os.path.exists(config_filepath):
+        print("Error: extractor config does not exists, please create the "
+              "file extractor_config.json or use the --extractor-config")
+        exit(1)
+
+    with open(config_filepath) as filehandler:
+        try:
+            extractor_config = json.load(filehandler)
+        except Exception as error:
+            print("Error: unable to load extractor config: {}".format(error))
+            exit(1)
+
+    iextractor = ActiveLearningCore(
+        relation, labeled_evidences, extractor_config, tradeoff=tuning_mode
+    )
+    return iextractor
+
+
+def run_from_command_line():
+    opts = docopt(__doc__, version=iepy.__version__)
+
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    logging.getLogger("featureforge").setLevel(logging.WARN)
+
+    tuning_mode = _get_tuning_mode(opts)
+    relation = _get_relation(opts)
+
+    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
+    labeled_evidences = load_labeled_evidences(relation, candidates)
+
+    if opts.get('--trained-extractor'):
+        iextractor = _load_extractor(opts, relation, labeled_evidences)
+        was_ever_trained = True
+        opts["--no-questions"] = True
+    else:
+        iextractor = _construct_extractor(opts, relation, labeled_evidences, tuning_mode)
+        iextractor.start()
+        was_ever_trained = False
+
+    if not opts.get("--no-questions", False):
+        questions_loop(iextractor, relation, was_ever_trained)
+
+    # Candidates generator was consumed when generating labeled_evidences, so we'll
+    # define it fresh again
+    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
+    # Predict and store output
+    predictions = iextractor.predict(candidates)  # asking predictions for EVERYTHING
+    if not predictions:
+        print("Nothing was predicted")
+        exit(1)
+
+    if opts.get("--db-store"):
+        output.dump_predictions_to_database(relation, predictions)
+
+    output_file = opts.get("<output>")
+    if output_file:
+        output.dump_runner_output_to_csv(predictions, output_file)
+
+    classifier_output = opts.get("--store-extractor")
+    if classifier_output:
+        iextractor.save(classifier_output)
+
+
+def questions_loop(iextractor, relation, was_ever_trained):
+    STOP = u'STOP'
+    term = TerminalAdministration(
+        relation,
+        extra_options=[(STOP, u'Stop execution')]
+    )
+    while iextractor.questions:
+        questions = list(iextractor.questions)  # copying the list
+        term.update_candidate_evidences_to_label(questions)
+        result = term()
+        i = 0
+        for c, label_value in load_labeled_evidences(relation, questions).items():
+            if label_value is not None:
+                iextractor.add_answer(c, label_value)
+                i += 1
+        print ('Added %s new human labels to the extractor core' % i)
+        iextractor.process()
+        was_ever_trained = True
+        if result == STOP:
+            break
+
+    if not was_ever_trained:
+        # It's needed to run some process before asking for predictions
+        iextractor.process()
+
+
+if __name__ == u'__main__':
+    run_from_command_line()

+ 12 - 0
examples/credit/bin/manage.py

@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+
import sys

from django.core.management import execute_from_command_line

import iepy
# NOTE(review): iepy.setup(__file__) appears to locate the IEPY instance
# settings relative to this file's path; it must run before any Django
# management command is dispatched — confirm against iepy's docs.
iepy.setup(__file__)


if __name__ == "__main__":
    # Delegate to Django's standard management commands (migrate, shell, ...).
    execute_from_command_line(sys.argv)

+ 96 - 0
examples/credit/bin/preprocess.py

@@ -0,0 +1,96 @@
+"""
+Corpus preprocessing script
+
+Usage:
+    preprocess.py [options]
+    preprocess.py --split-in=<num-splits> --run-part=<num-part>
+    preprocess.py --increment-ner
+    preprocess.py -h | --help | --version
+
+Options:
+  -h --help                      Show this screen
+  --multiple-cores=<num-cores>   Number of cores (use all to use every processor)
+  --increment-ner                Re run NER and Gazetter for every document. If a document lacked any of the previous steps, will be preprocessed entirely.
+  --version                      Version number
+"""
+import logging
+
+from docopt import docopt
+
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+import iepy
+import multiprocessing
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.selfpreprocess.self_preprocess import SelfPreprocesser
+from iepy.selfpreprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.stanford_preprocess import StanfordPreprocess
+# from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.segmenter import SyntacticSegmenterRunner
+
+
+
+
class ParallelDocManager(DocumentManager):
    """DocumentManager that can split a queryset into disjoint shards by id,
    so several workers can preprocess different documents in parallel."""

    def mines_of(self, qset, number_of_processors, my_id):
        """Return the subset of ``qset`` assigned to worker ``my_id``.

        Sharding uses a raw SQL ``id % K = N`` clause.  The ``%%%%`` escaping
        is intentional: Python's ``%`` formatting collapses it to ``%%``, and
        the database driver's parameter escaping reduces that to the single
        literal ``%`` (modulo) when the query executes.
        """
        K = number_of_processors
        N = my_id
        clause = 'id %%%% %s = %s' % (K, N)
        return qset.extra(where=[clause])
+
def start_preprocess(docs, increment_ner):
    """Run the preprocessing pipeline over the given documents."""
    steps = [
        SelfPreprocesser(increment_ner),
        # SyntacticSegmenterRunner(increment=True)
    ]
    PreProcessPipeline(steps, docs).process_everything()
+
if __name__ == '__main__':
    # CLI entry point: preprocess all documents lacking the brat step,
    # optionally sharded over several processes or run as one shard.
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    opts = docopt(__doc__, version=iepy.__version__)
    increment_ner = opts['--increment-ner']

    dm = ParallelDocManager()
    all_docs = dm.get_documents_lacking_preprocess(
        [PreProcessSteps.brat])

    multiple_cores = opts.get('--multiple-cores')
    split_in = opts.get("--split-in")
    run_part = opts.get("--run-part")

    if multiple_cores:
        # Fan out over N worker processes; worker i handles the shard of
        # documents with id % N == i.
        if multiple_cores == "all":
            multiple_cores = multiprocessing.cpu_count()
        try:
            multiple_cores = int(multiple_cores)
        except ValueError:
            logger.error("Invalid number of cores")
            exit(1)

        for i in range(multiple_cores):
            process = multiprocessing.Process(
                target=start_preprocess,
                args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner)
            )
            process.start()
    elif split_in:
        # Run a single shard: 1-based on the command line, 0-based internally.
        try:
            split_in = int(split_in)
            run_part = int(run_part) - 1
        except (ValueError, TypeError):
            logger.error("Invalid split")
            exit(1)

        # BUG FIX: valid 0-based parts are 0 .. split_in-1.  The original
        # checked ``run_part > split_in`` and therefore silently accepted
        # ``--run-part == split_in + 1`` (an empty, out-of-range shard).
        if run_part < 0 or run_part >= split_in:
            logger.error("Parts must be between 1 and {}".format(split_in))
            exit(1)

        docs = dm.mines_of(all_docs, split_in, run_part)
        start_preprocess(docs, increment_ner)
    else:
        # No sharding requested: process everything in this process.
        start_preprocess(all_docs, increment_ner)

+ 149 - 0
examples/credit/bin/rules_verifier.py

@@ -0,0 +1,149 @@
+"""
+IEPY rules verifier
+
+
+Usage:
+    rules_verifier.py <relation> [options]
+
+Options:
+  --shuffle             Chooses the sample randomly and not the first ones
+  --create-evidences    Creates evidences that are missing [default: false]
+  -r --rule=<rule>      Tests only this rule
+  -l --limit=<limit>    Limits the amount of evidences uses
+  -h --help             Show this screen
+"""
+
+import sys
+import logging
+from docopt import docopt
+
+import refo
+from django.core.exceptions import ObjectDoesNotExist
+from colorama import init as colorama_init
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.data import models
+from iepy.data.models import EvidenceCandidate
+from iepy.data.db import CandidateEvidenceManager
+from iepy.extraction.terminal import TerminalEvidenceFormatter
+from iepy.extraction.rules import (
+    load_rules, compile_rule, generate_tokens_to_match
+)
+from iepy.metrics import result_dict_from_predictions
+
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
def run_from_command_line():
    """Parse CLI options, load the relation, its rules and the candidate
    evidences, then run every rule against every evidence."""
    opts = docopt(__doc__, version=iepy.__version__)
    relation_name = opts.get("<relation>")
    raw_limit = opts.get("--limit")
    rule_name = opts.get("--rule")
    shuffle = opts.get("--shuffle")
    create_evidences = opts.get("--create-evidences")

    # A missing --limit means "no limit", expressed as -1 downstream.
    try:
        limit = -1 if raw_limit is None else int(raw_limit)
    except ValueError:
        logging.error("Invalid limit value, it must be a number")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Compile each rule into a (name, regex, expected answer) triple.
    rule_regexes = [
        (rule.__name__, compile_rule(rule, relation), rule.answer)
        for rule in get_rules(rule_name)
    ]

    # If no candidate evidence exists at all, force its creation.
    if EvidenceCandidate.objects.all().count() == 0:
        create_evidences = True
    evidences = CandidateEvidenceManager.candidates_for_relation(
        relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle
    )
    answers = CandidateEvidenceManager.labels_for(
        relation, evidences,
        CandidateEvidenceManager.conflict_resolution_newest_wins
    )
    run_tests(rule_regexes, evidences, answers)
+
+
def run_tests(rule_regexes, evidences, answers):
    """Print the evidences matched by each rule and, when human labels are
    available, the resulting classification metrics."""
    predictions = []
    real_labels = []
    evidences_with_labels = []

    colorama_init()
    formatter = TerminalEvidenceFormatter()

    for name, regex, answer in rule_regexes:
        title = "Matches for rule '{}' (value: {})".format(name, answer)
        print("\n{}\n{}".format(title, "-" * len(title)))

        matched_something = False
        for evidence in evidences:
            match = refo.match(regex, generate_tokens_to_match(evidence))
            if match:
                matched_something = True
                print("  * {}".format(formatter.colored_text(evidence)))

            # Only human-labeled evidences count towards the metrics.
            label = answers.get(evidence)
            if label is not None:
                evidences_with_labels.append(evidence)
                real_labels.append(label)
                # A match predicts the rule's answer; no match predicts False.
                predictions.append(answer if match else False)

        if not matched_something:
            print("  nothing matched")

        print()

    if real_labels:
        results = result_dict_from_predictions(
            evidences_with_labels, real_labels, predictions
        )
        results.pop("end_time")

        title = "Metrics"
        print("{}\n{}".format(title, "-" * len(title)))
        for key in ("true_positives", "true_negatives",
                    "false_positives", "false_negatives",
                    "precision", "recall",
                    "accuracy", "f1"):
            print("{:>15}: {:.2f}".format(key, results[key]))
+
+
def get_rules(rule_name):
    """Load every defined rule; if ``rule_name`` is given, keep only that
    rule and abort with an error when no rule has that name.

    :param rule_name: optional name of a single rule to test
    :return: list of rule objects (never empty; exits on failure)
    """
    rules = load_rules()

    if rule_name:
        rules = [rule for rule in rules if rule.__name__ == rule_name]
        if not rules:
            # BUG FIX: grammar in the user-facing message
            # ("does not exists" -> "does not exist").
            logging.error("rule '{}' does not exist".format(rule_name))
            sys.exit(1)

    return rules
+
+
+if __name__ == "__main__":
+    run_from_command_line()

+ 241 - 0
examples/credit/bin/settlement.py

@@ -0,0 +1,241 @@
+
+
+
+
+from django.db.models import Q
+import datetime,time
+import iepy
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.data.models import IEDocument,LabeledIEDocument,IEDocumentMetadata,LabeledIEDocumentMetadata,Payroll
+from brat.models import BratAnnotation,LabeledBratAnnotation
+from django.db import transaction
+import pandas as pd
+from django.contrib.auth.models import User
+
def object_to_dict(obj, class_model):
    """Build a dict mapping each local field name of ``class_model`` to the
    corresponding value on ``obj``.

    :param obj: a model instance
    :param class_model: the Django model class describing the fields
    :return: dict of field name -> field value
    (Docstring translated from the original Chinese.)
    """
    concrete_model = class_model._meta.concrete_model
    return {
        field.name: field.value_from_object(obj)
        for field in concrete_model._meta.local_fields
    }
+
+
class Settlement():

    '''
    Settlement helper: groups the operations a payroll administrator runs to
    compute, export and manage annotators' labeling statistics.
    (All docstrings translated from the original Chinese.)

    NOTE(review): every raw SQL statement below is built with string
    interpolation, so the arguments must come from trusted callers only.
    '''

    def makePayroll(self,_user,time_begin,time_end):
        '''
        Compute one user's labeling stats over a period and upsert a Payroll row.

        :param _user: username
        :param time_begin: period start (yyyy-mm-dd)
        :param time_end: period end (yyyy-mm-dd)
        :return: None; creates or updates the user's Payroll record
        '''
        from django.db import connection
        with transaction.atomic():
            cursor = connection.cursor()
            # Number of documents the user edited inside the period.
            sql = " select count(1) from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s'"%(_user,time_end,time_begin)
            cursor.execute(sql)
            doc_count = cursor.fetchall()[0][0]
            # Entity ("T...") annotations within the user's newest 1200 documents.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"T%")
            cursor.execute(sql)
            t_count = cursor.fetchall()[0][0]
            # Relation ("R...") annotations within the newest 1200 documents.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"R%")
            cursor.execute(sql)
            r_count = cursor.fetchall()[0][0]
            # Entity annotations over ALL the user's documents in the period.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s') and value like '%s' "%(_user,time_end,time_begin,"T%")
            cursor.execute(sql)
            all_t_count = cursor.fetchall()[0][0]
            # Relation annotations over ALL the user's documents in the period.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s') and value like '%s' "%(_user,time_end,time_begin,"R%")
            cursor.execute(sql)
            all_r_count = cursor.fetchall()[0][0]
            # Piece rates: annotations inside the newest 1200 documents pay
            # 0.03 (entity) / 0.05 (relation); the rest pay 0.04 / 0.06.
            # TODO(review): confirm these magic rates and the 1200 cutoff.
            wage = round(0.03*t_count+0.05*r_count+(all_t_count-t_count)*0.04+(all_r_count-r_count)*0.06,2)
            print(doc_count,t_count,r_count,wage)
            # Upsert: keep a single Payroll row per (user, period).
            payrolls = Payroll.objects.filter(Q(user=_user)& Q(begin_time=time_begin) & Q(end_time=time_end))
            if len(payrolls)==0:
                _payroll = Payroll.objects.create(**{"user":_user,"doc_count":doc_count,"begin_time":time_begin,"end_time":time_end,"t_count":all_t_count,"r_count":all_r_count,"wage":wage})
                _payroll.save()
            else:
                _payroll = payrolls[0]
                _payroll.doc_count = doc_count
                _payroll.t_count = all_t_count
                _payroll.r_count = all_r_count
                _payroll.wage = wage
                _payroll.save()

    def exportPayroll(self,begin_time,end_time):
        '''
        Export the Payroll rows of a period to an Excel spreadsheet.

        :param begin_time: export start date, or None to match on end date only
        :param end_time: export end date
        :return: None; writes an .xls file into the working directory
        '''
        list_user = []
        list_doc_count = []
        list_t_count = []
        list_r_count = []
        list_wage = []
        list_yield = []
        list_account = []
        list_begin = []
        list_end = []
        if begin_time is not None:
            payrolls = Payroll.objects.filter(Q(begin_time=begin_time) & Q(end_time=end_time))
        else:
            payrolls = Payroll.objects.filter(Q(end_time=end_time))
        for _payroll in payrolls:
            list_user.append(_payroll.user)
            list_doc_count.append(_payroll.doc_count)
            list_t_count.append(_payroll.t_count)
            list_r_count.append(_payroll.r_count)
            list_wage.append(_payroll.wage)
            list_yield.append(_payroll._yield)
            list_account.append(_payroll.account)
            list_begin.append(_payroll.begin_time)
            list_end.append(_payroll.end_time)
        # Column headers are Chinese display labels (user, begin, end, docs,
        # entities, relations, total, pass rate, settled price) and must stay
        # as-is: they are part of the produced spreadsheet.
        df = pd.DataFrame({"用户":list_user,"开始时间":list_begin,"结束时间":list_end,"文章数":list_doc_count,"要素数":list_t_count,"关系数":list_r_count,"总价":list_wage,"合格率":list_yield,"结算价":list_account})
        df.to_excel("%s-%s要素标注统计.xls"%(begin_time,end_time),columns=["用户","开始时间","结束时间","文章数","要素数","关系数","总价","合格率","结算价"])

    def getAllUser(self):
        '''Return the usernames of every staff user (auth_user.is_staff).'''
        from django.db import connection
        with transaction.atomic():
            list_user = []
            cursor = connection.cursor()
            sql = "select username from auth_user where is_staff='t'"
            cursor.execute(sql)
            for row in cursor.fetchall():
                list_user.append(row[0])
            return list_user


    def makeMigrate(self,_user,time_begin,time_end):
        '''
        Migrate the user's labeled data of the period into the "labeled"
        standard tables.

        :param _user: username
        :param time_begin: period start
        :param time_end: period end

        NOTE(review): currently a no-op -- the whole body is commented out
        (it contains both an offsets-repair pass and the actual migration).
        '''
        pass
        # from django.db import connection
        # with transaction.atomic():
        #     cursor = connection.cursor()
        #     sql = " select human_identifier,offsets_to_text,sentences from corpus_iedocument where edituser is null"
        #     cursor.execute(sql)
        #     cursor1 = connection.cursor()
        #     _index = 0
        #     rows = True
        #     while(rows):
        #         rows=cursor.fetchmany(1000)
        #         for row in rows:
        #             _index += 1
        #             print(_index)
        #             human_identifier,offsets_to_text,sentences = row
        #             if sentences!="[]":
        #                 _off = offsets_to_text.split(", ")[-1][:-1]
        #                 _sen = sentences.split(", ")[-1][:-1]
        #                 print(_off,_sen)
        #                 if int(_off)!=int(_sen):
        #                     offsets_to_text = offsets_to_text[:-1]+", "+str(int(_sen))+"]"
        #                     print(offsets_to_text)
        #                     cursor1.execute("update corpus_iedocument set offsets_to_text='%s' where human_identifier='%s'"%(offsets_to_text,human_identifier))



            # ieDocuments = IEDocument.objects.filter(Q(edituser=_user) & Q(edittime__range=(time_begin,time_end)))
            # for obj in ieDocuments:
            #     _dict = object_to_dict(obj,IEDocument)
            #     _dict_meta = object_to_dict(obj.metadata,IEDocumentMetadata)
            #     labeledMeta = LabeledIEDocumentMetadata.objects.create(**_dict_meta)
            #     labeledMeta.save()
            #     _dict["metadata"] = labeledMeta
            #     tmp = LabeledIEDocument.objects.create(**_dict)
            #     tmp.save()
            #
            #     bratAnnotations = BratAnnotation.objects.filter(Q(document_id=obj.human_identifier))
            #     for ann in bratAnnotations:
            #         _dict_ann = object_to_dict(ann,BratAnnotation)
            #         labeledAnn = LabeledBratAnnotation.objects.create(**_dict_ann)
            #         labeledAnn.save()


    def getPercentOfPass(self,_user,time_begin,time_end):
        '''
        Return the pass rate of the user's annotations within the period.

        :param _user: username
        :param time_begin: period start
        :param time_end: period end

        NOTE(review): not implemented -- the body is only this docstring,
        so the method always returns None.
        '''

    def makePayrolls(self,time_begin,time_end):
        '''
        Build payroll rows for every staff user over the period, then export
        them all to a spreadsheet.

        :param time_begin: period start
        :param time_end: period end
        '''
        for _user in self.getAllUser():
            self.makePayroll(_user,time_begin,time_end)
        self.exportPayroll(time_begin,time_end)

    def createUser_batch(self,batch_size=90):
        '''
        Create ``batch_size`` annotator accounts named bidi1..bidiN, each
        with the password equal to its username.

        :param batch_size: number of users to create
        NOTE(review): passwords equal usernames -- acceptable only for
        throwaway annotation accounts.
        '''
        list_user = [User.objects.create_user(username="bidi%d"%(i+1),password="bidi%d"%(i+1)) for i in range(batch_size)]

    def exportLabels(self):
        '''Export, for each hard-coded group of bidiN users, which documents
        each user labeled and when, as one Excel file per group.'''
        # Inclusive 1-based ranges of bidi user indexes per group.
        groups = [[1,7],[8,14],[15,22],[23,29],[30,36],[37,43],[44,50],[51,56],[57,62],[63,71]]
        from django.db import connection
        cursor = connection.cursor()
        for _i in range(len(groups)):
            _begin,_end = groups[_i]
            list_username = []
            list_user = []
            list_label = []
            list_time = []
            for _j in range(_begin,_end+1):
                username = "bidi%d"%_j
                list_username.append("'%s'"%username)
            sql = " select edituser,human_identifier,to_char(edittime,'yyyy-mm-dd') from corpus_iedocument where edituser in(%s) order by edittime asc"%(",".join(list_username))
            print(sql)
            cursor.execute(sql)
            rows = cursor.fetchall()
            for row in rows:
                list_user.append(row[0])
                list_label.append(row[1])
                list_time.append(row[2])
            # Chinese column headers (time, user, document id) are part of
            # the produced spreadsheet and must stay as-is.
            df = pd.DataFrame({"时间":list_time,"用户":list_user,"文章编号":list_label})
            df.to_excel("分组_%d.xls"%(_i+1),columns=["时间","用户","文章编号"])

    def filter(self):
        '''
        Flag auction notices: set jump_signal=1 on every document whose text
        contains "拍卖" (auction).
        NOTE(review): method name shadows the builtin ``filter``; scans the
        whole IEDocument table.
        '''
        import re
        ieDocuments = IEDocument.objects.all()
        for obj in ieDocuments:
            if re.search("拍卖",obj.text) is not None:
                obj.jump_signal = 1
                obj.save()
                print(obj.human_identifier)
+
+
+
+if __name__=="__main__":
+    settle = Settlement()
+    # settle.makeMigrate("test","2020-08-01","2020-08-31")
+    # settle.makePayroll("test17","2020-08-01","2020-10-31")
+    # settle.makePayrolls("2020-08-01","2020-08-31")
+    settle.exportPayroll(begin_time=None,end_time='2020-10-31')
+    # settle.createUser_batch(batch_size=102)
+    # settle.exportLabels()
+    # settle.filter()

+ 20 - 0
examples/credit/extractor_config.json

@@ -0,0 +1,20 @@
+{
+    "sparse_features": [
+        "bag_of_words",
+        "bag_of_pos",
+        "bag_of_words_in_between",
+        "bag_of_pos_in_between"
+    ],
+    "dense_features": [
+        "entity_order",
+        "entity_distance",
+        "other_entities_in_between",
+        "verbs_count_in_between",
+        "verbs_count",
+        "total_number_of_entities",
+        "symbols_in_between",
+        "number_of_tokens"
+    ],
+    "classifier_args": {},
+    "classifier": "svc"
+}

+ 6 - 0
examples/credit/format.py

@@ -0,0 +1,6 @@
import time

# Scratch script: print "now minus 4 days" as an epoch timestamp, then the
# date of each epoch in ``a``.
a = [1462636800, 1606492800]
print(time.time() - 86400 * 4)

for item in a:
    # BUG FIX: the original formatted the hard-coded epoch 1606377029 on
    # every iteration instead of the loop variable ``item``.
    print(time.strftime('%Y-%m-%d', time.localtime(item)))

+ 2 - 0
examples/credit/rules.py

@@ -0,0 +1,2 @@
+# Write here your rules
+# RELATION = 'your relation here'

+ 182 - 0
examples/credit/settings.py

@@ -0,0 +1,182 @@
+"""
+For more information on this file, see
+https://docs.djangoproject.com/en/1.7/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.7/ref/settings/
+"""
+
+from iepy.webui.webui.settings import *
+from django.conf import settings
+
IEPY_VERSION = '0.9.6'
IEPY_LANG = 'en'
# NOTE(review): the secret key and the DB credentials below are hardcoded
# and committed to the repository; move them to environment variables
# before any real deployment.
SECRET_KEY = 'u==!fueit=wxo&j8!5u+sfasp4prjluk@*s=7!-wz_&r@pn))r'
# Development-only flags: never run production with DEBUG enabled.
DEBUG = True
TEMPLATE_DEBUG = True

# Database
# https://docs.djangoproject.com/en/1.7/ref/settings/#databases
# DATABASES = {
#     'default': {
#         'ENGINE': 'django.db.backends.sqlite3',
#         'NAME': '/home/python/luojiehua/dl_nlp/iepy-develop/examples/test/test.sqlite',
#     }
# }
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql_psycopg2',
        'NAME': 'iepy_credit',
        'USER': 'postgres',
        'PASSWORD': 'postgres',
        'HOST': '192.168.2.101',
        'PORT': '5432'
    }
}
+
+# For changing tokenization options, read here.
+# http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html
+# You can use as key any of the "known options" listed on that page, and as value,
+# use True or False (python names) for booleans, or strings when option requires a text
+# CORENLP_TKN_OPTS = {
+#     'latexQuotes': False
+# }
+
# Default brat annotation-tool configuration, keyed by brat config filename.
# Each value is the verbatim content of that file (entity/relation inventory
# for the credit/bidding domain; visual.conf maps ids to Chinese labels).
CONFIG_BRAT = {
    # Access-control rules served to brat.
    "acl.conf":"""
User-agent: *
Allow: /
Disallow: /hidden/

User-agent: guest
Disallow: /confidential/
""",
    # Entity spans, relations, events and attributes available for labeling.
    "annotation.conf":"""
[spans]
punishment_code
code
name
money
	money_tendereeMoney
	money_tendererMoney
org
	org_tenderee
	org_agency
	org_tenderer
	org_secondTenderer
	org_thirdTenderer
company
	company_tenderee
	company_agency
	company_tenderer
	company_secondTenderer
	company_thirdTenderer
job
person
	person_tendereePerson
	person_agencyPerson
	person_person
	person_review
time
	time_release
	time_bidopen
	time_bidclose
location
package
phone
moneysource
bidway
serviceTime
[relations]
Equiv	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, <REL-TYPE>:symmetric-transitive
rel_tendererMoney	Arg1:org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderer|company_secondTenderer|company_thirdTenderer|org|company, Arg2:money_tendererMoney
rel_tendereeMoney	Arg1:package, Arg2:money_tendereeMoney|money
rel_person	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:person_tendereePerson|person_agencyPerson|person_person
rel_pack	Arg1:org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:package
rel_address	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:location
rel_phone	Arg1:person_tendereePerson|person_agencyPerson|person_person, Arg2:phone
rel_pack_code	Arg1:package, Arg2:code
rel_pack_name	Arg1:package, Arg2:name


[events]
#Protein_binding|GO:0005515	Theme+:Protein
#Gene_expression|GO:0010467	Theme:Protein

[attributes]
#att_role	Arg:<ENTITY>, Value:招标人|代理人|中标人|第二候选人|第三候选人|att_noRole
#att_role	Arg:<ENTITY>, Value:att_tenderee|att_agency|att_tenderer|att_secondTenderer|att_thirdTenderer|att_noRole
#att_money	Arg:<ENTITY>, Value:att_tendereeMoney|att_tendererMoney|att_nomoney
#att_person	Arg:<ENTITY>, Value:att_noperson|att_tendereePerson|att_agencyPerson|att_person
#Negation	Arg:<EVENT>
#Speculation	Arg:<EVENT>
""",
    # Display labels (Chinese) and drawing defaults for the brat UI.
    "visual.conf":"""
[labels]
punishment_code | 处罚编号
code | 项目编号
name | 项目名称
org | 组织
company | 公司
job | 职业
person | 人名
time | 时间
location | 地址
package | 包号
phone | 电话
money | 金额
money_tendereeMoney | 招标金额
money_tendererMoney | 中投标金额

org_tenderee | 招标人
org_agency | 代理人
org_tenderer | 中标人
org_secondTenderer | 第二候选人
org_thirdTenderer | 第三候选人
company_tenderee | 招标人
company_agency | 代理人
company_tenderer | 中标人
company_secondTenderer | 第二候选人
company_thirdTenderer | 第三候选人

person_tendereePerson | 招标联系人
person_agencyPerson | 代理联系人
person_person | 联系人

rel_tendererMoney | 中投标金额
rel_tendereeMoney | 招标金额
rel_person | 联系人
rel_pack | 所属包
rel_address | 地址
rel_phone | 联系电话
rel_pack_code | 包件编号
rel_pack_name | 包件名称

person_review | 评审专家
time_release | 发布时间
time_bidopen | 开标时间
time_bidclose | 截标时间
moneysource | 资金来源
bidway | 招标方式
serviceTime | 服务期限

#Protein | Protein | Pro | P
#Protein_binding | Protein binding | Binding | Bind
#Gene_expression | Gene expression | Expression | Exp
#Theme | Theme | Th

[drawing]
Protein	bgColor:#7fa2ff
SPAN_DEFAULT	fgColor:black, bgColor:lightgreen, borderColor:black
ARC_DEFAULT	color:black
ATTRIBUTE_DEFAULT	glyph:*
""",
    # External search tools offered by the brat UI.
    "tools.conf":"""
[search]
google     <URL>:http://www.google.com/search?q=%s
""",
    # Keyboard shortcuts for fast labeling.
    "kb_shortcuts.conf":"""
P	Protein
"""
}

BIN
examples/credit/test.sqlite


+ 1 - 0
examples/product/__init__.py

@@ -0,0 +1 @@
+from . import rules

+ 49 - 0
examples/product/annotation.conf

@@ -0,0 +1,49 @@
+# -*- Mode: Text; tab-width: 8; indent-tabs-mode: nil; coding: utf-8; -*-
+# vim:set ft=conf ts=2 sw=2 sts=2 autoindent:
+
+# Simple text-based definitions of entity, relation and event types
+# and event attributes for the BioNLP Shared Task 2011 EPI task.
+
+
+[entities]
+
+Protein
+	abc
+Entity
+
+
+[relations]
+
+Equiv	Arg1:Protein, Arg2:Protein, <REL-TYPE>:symmetric-transitive
+Equiv	Arg1:Entity, Arg2:Entity, <REL-TYPE>:symmetric-transitive
+
+# (No entity nestings permitted in EPI. Could be defined using special
+# relation type ENTITY-NESTING if necessary.)
+
+
+[events]
+
+Catalysis	Theme:<EVENT>, Cause:Protein
+----------------------------------------
+DNA_methylation|GO:0006306	Theme:Protein, Site?:Entity
+DNA_demethylation|GO:0080111	Theme:Protein, Site?:Entity
+----------------------------------------
+Acetylation|GO:0006473	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Methylation|GO:0006479	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Glycosylation|GO:0006486	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Hydroxylation|GO:0018126	Theme:Protein, Site?:Entity
+Phosphorylation|GO:0006468	Theme:Protein, Site?:Entity
+Ubiquitination|GO:0016567	Theme:Protein, Site?:Entity
+----------------------------------------
+Deacetylation|GO:0006476	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Demethylation|GO:0006482	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Deglycosylation|GO:0006517	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Dehydroxylation|GO:-------	Theme:Protein, Site?:Entity
+Dephosphorylation|GO:0006470	Theme:Protein, Site?:Entity
+Deubiquitination|GO:0016579	Theme:Protein, Site?:Entity
+
+
+[attributes]
+
+Negation	Arg:<EVENT>
+Speculation	Arg:<EVENT>

Fichier diff supprimé car celui-ci est trop grand
+ 7 - 0
examples/product/articles.csv


+ 28 - 0
examples/product/bin/csv_to_iepy.py

@@ -0,0 +1,28 @@
+"""
+IEPY database loader from csv file
+
+Usage:
+    csv_to_iepy.py <filename>
+    csv_to_iepy.py -h | --help
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+corpus in two columns: 'freebase_mid' and 'description'.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+
+import logging
+
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.utils import csv_to_iepy
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    opts = docopt(__doc__, version=iepy.__version__)
+    filepath = opts["<filename>"]
+    csv_to_iepy(filepath)

+ 76 - 0
examples/product/bin/gazettes_loader.py

@@ -0,0 +1,76 @@
+"""
+IEPY gazettes loader
+
+Usage:
+    gazettes_loader.py <filename>
+
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+gazettes in two columns: 'literal' and 'class'.
+
+
+Options:
+  -h --help             Show this screen
+"""
+
+import sys
+import csv
+import gzip
+import logging
+from operator import itemgetter
+
+from django.db import IntegrityError
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.data.models import EntityKind, GazetteItem
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
def add_gazettes_from_csv(filepath):
    """Load gazette items from a csv (optionally gzipped) file that has at
    least the columns 'literal' and 'class', and store them in the database.

    :param filepath: path to a .csv or .csv.gz file
    Exits the process if the expected columns are missing.
    """
    opener = gzip.open if filepath.endswith(".gz") else open
    # BUG FIX: the original never closed the file handle.  The context
    # manager both closes it and guarantees the file stays open while the
    # generator below is being consumed.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)

        expected_fnames = ['literal', 'class']
        if not set(reader.fieldnames).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv: {}"
            sys.exit(msg.format(expected_fnames))

        _create_gazette_entries(
            itemgetter(*expected_fnames)(line) for line in reader
        )
+
+
def _create_gazette_entries(entries_list):
    """Persist (literal, kind_name) pairs as GazetteItem rows.

    Entity kinds are get_or_create'd once and cached.  Literals that already
    exist (IntegrityError) are reported and skipped.

    :param entries_list: iterable of (literal, kind_name) string pairs
    """
    kind_cache = {}
    created = 0
    for literal, kind_name in entries_list:
        literal = literal.strip()
        kind_name = kind_name.strip()
        kind = kind_cache.get(kind_name)
        if kind is None:
            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
            kind_cache[kind_name] = kind
        gazette = GazetteItem(text=literal, kind=kind)

        try:
            gazette.save()
        except IntegrityError as error:
            # logging.warn is a deprecated alias; warning() is the real API.
            logging.warning(
                "Gazette '{}' of class '{}' not loaded, literal already existed".format(
                literal, kind_name))
            print(error)
        else:
            # BUG FIX: the original used ``finally`` here, so rows rejected
            # with IntegrityError were still counted as created.
            created += 1
    print('Created {} new gazette items'.format(created))
+
+
+if __name__ == "__main__":
+    opts = docopt(__doc__, version=iepy.__version__)
+    fname = opts["<filename>"]
+    add_gazettes_from_csv(fname)

+ 59 - 0
examples/product/bin/iepy_rules_runner.py

@@ -0,0 +1,59 @@
+"""
+Run IEPY rule-based extractor
+
+Usage:
+    iepy_rules_runner.py
+    iepy_rules_runner.py -h | --help | --version
+
+Picks from rules.py the relation to work with, and the rules definitions and
+proceeds with the extraction.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+import sys
+import logging
+
+from django.core.exceptions import ObjectDoesNotExist
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.extraction.rules import load_rules
+from iepy.extraction.rules_core import RuleBasedCore
+from iepy.data import models, output
+from iepy.data.db import CandidateEvidenceManager
+
+
def run_from_command_line():
    """Run the rule-based extraction pipeline for the relation declared as
    RELATION in the instance's rules module, dumping predictions at the end."""
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    try:
        relation_name = iepy.instance.rules.RELATION
    except AttributeError:
        logging.error("RELATION not defined in rules file")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    rules = load_rules()
    evidences = CandidateEvidenceManager.candidates_for_relation(relation)

    # Build, warm up and run the rule-based extractor, then dump its output.
    extractor = RuleBasedCore(relation, rules)
    extractor.start()
    extractor.process()
    output.dump_output_loop(extractor.predict(evidences))
+
+
if __name__ == '__main__':
    # Script entry point.
    run_from_command_line()

+ 184 - 0
examples/product/bin/iepy_runner.py

@@ -0,0 +1,184 @@
+"""
+Run IEPY active-learning extractor
+
+Usage:
+    iepy_runner.py [options] <relation_name> <output>
+    iepy_runner.py [options] --db-store <relation_name>
+    iepy_runner.py -h | --help | --version
+
+Options:
+  --store-extractor=<extractor_output>     Stores the trained classifier
+  --trained-extractor=<extractor_path>     Load an already trained extractor
+  --db-store                               Stores the predictions on the database
+  --no-questions                           Won't generate questions to answer. Will predict
+                                           as is. Should be used with --trained-extractor
+  --tune-for=<tune-for>                    Predictions tuning. Options are high-prec
+                                           or high-recall [default: high-prec]
+  --extractor-config=<config.json>         Sets the extractor config
+  --version                                Version number
+  -h --help                                Show this screen
+"""
+
+import os
+import json
+import logging
+from docopt import docopt
+from sys import exit
+
+import iepy
+INSTANCE_PATH = iepy.setup(__file__)
+
+from iepy.extraction.active_learning_core import ActiveLearningCore, HIPREC, HIREC
+from iepy.data.db import CandidateEvidenceManager
+from iepy.data.models import Relation
+from iepy.extraction.terminal import TerminalAdministration
+from iepy.data import output
+
+
def print_all_relations():
    """Print every Relation stored in the database, one per line."""
    print("All available relations:")
    for rel in Relation.objects.all():
        line = "  {}".format(rel)
        print(line)
+
+
def load_labeled_evidences(relation, evidences):
    """Return labels for *evidences*, resolving conflicts newest-wins."""
    manager = CandidateEvidenceManager
    return manager.labels_for(
        relation, evidences, manager.conflict_resolution_newest_wins
    )
+
+
def _get_tuning_mode(opts):
    """Map the --tune-for CLI option to its tuning constant, or exit(1)."""
    modes = {'high-prec': HIPREC, 'high-recall': HIREC}
    choice = opts['--tune-for']
    if choice not in modes:
        print('Invalid tuning mode')
        print(__doc__)
        exit(1)
    return modes[choice]
+
+
def _get_relation(opts):
    """Fetch the Relation named on the CLI; on failure list all and exit."""
    name = opts['<relation_name>']
    try:
        return Relation.objects.get(name=name)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(name))
        print_all_relations()
        exit(1)
+
+
def _load_extractor(opts, relation, labeled_evidences):
    """Load a previously trained extractor from disk and sanity-check it.

    Exits with status 1 if the file is not a valid extractor, or if it
    was trained for a different relation than the requested one.
    """
    # NOTE(review): callers are expected to have checked --trained-extractor
    # is set; opts.get() would hand None to load() otherwise.
    extractor_path = opts.get('--trained-extractor')
    try:
        iextractor = ActiveLearningCore.load(extractor_path,
                                             labeled_evidences=labeled_evidences)
    except ValueError:
        print("Error: unable to load extractor, invalid file")
        exit(1)

    # Refuse an extractor trained for some other relation.
    if iextractor.relation != relation:
        print('The loaded extractor is not for the requested relation'
              ' but for relation {} instead'.format(iextractor.relation))
        exit(1)
    print('Extractor successfully loaded')
    return iextractor
+
+
def _construct_extractor(opts, relation, labeled_evidences, tuning_mode):
    """Build a fresh ActiveLearningCore from the JSON extractor config.

    The config path comes from --extractor-config, falling back to
    extractor_config.json in the IEPY instance directory.  Exits with
    status 1 when the file is missing or cannot be parsed as JSON.
    """
    config_filepath = opts.get("--extractor-config")
    if not config_filepath:
        config_filepath = os.path.join(INSTANCE_PATH, "extractor_config.json")

    if not os.path.exists(config_filepath):
        print("Error: extractor config does not exists, please create the "
              "file extractor_config.json or use the --extractor-config")
        exit(1)

    with open(config_filepath) as filehandler:
        try:
            extractor_config = json.load(filehandler)
        except Exception as error:
            print("Error: unable to load extractor config: {}".format(error))
            exit(1)

    iextractor = ActiveLearningCore(
        relation, labeled_evidences, extractor_config, tradeoff=tuning_mode
    )
    return iextractor
+
+
def run_from_command_line():
    """Drive the active-learning extraction run from CLI options.

    Builds or loads an extractor, optionally runs the interactive
    labeling loop, then predicts over every candidate evidence and
    stores the result to the database and/or a CSV file as requested.
    """
    opts = docopt(__doc__, version=iepy.__version__)

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    tuning_mode = _get_tuning_mode(opts)
    relation = _get_relation(opts)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)

    if opts.get('--trained-extractor'):
        iextractor = _load_extractor(opts, relation, labeled_evidences)
        was_ever_trained = True
        # A pre-trained extractor implies no interactive questions.
        opts["--no-questions"] = True
    else:
        iextractor = _construct_extractor(opts, relation, labeled_evidences, tuning_mode)
        iextractor.start()
        was_ever_trained = False

    if not opts.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # Candidates generator was consumed when generating labeled_evidences, so we'll
    # define it fresh again
    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    # Predict and store output
    predictions = iextractor.predict(candidates)  # asking predictions for EVERYTHING
    if not predictions:
        print("Nothing was predicted")
        exit(1)

    if opts.get("--db-store"):
        output.dump_predictions_to_database(relation, predictions)

    output_file = opts.get("<output>")
    if output_file:
        output.dump_runner_output_to_csv(predictions, output_file)

    classifier_output = opts.get("--store-extractor")
    if classifier_output:
        iextractor.save(classifier_output)
+
+
def questions_loop(iextractor, relation, was_ever_trained):
    """Interactive loop: show questions, ingest human labels, retrain.

    Repeats until the extractor has no more questions or the user picks
    the STOP option.  NOTE(review): *was_ever_trained* is only rebound
    locally; the caller never observes the update.
    """
    STOP = u'STOP'
    term = TerminalAdministration(
        relation,
        extra_options=[(STOP, u'Stop execution')]
    )
    while iextractor.questions:
        questions = list(iextractor.questions)  # copying the list
        term.update_candidate_evidences_to_label(questions)
        result = term()
        i = 0
        # Feed every freshly-labeled answer back into the extractor core.
        for c, label_value in load_labeled_evidences(relation, questions).items():
            if label_value is not None:
                iextractor.add_answer(c, label_value)
                i += 1
        print ('Added %s new human labels to the extractor core' % i)
        iextractor.process()
        was_ever_trained = True
        # STOP is honored only after the answers above have been processed.
        if result == STOP:
            break

    if not was_ever_trained:
        # It's needed to run some process before asking for predictions
        iextractor.process()
+
+
# NOTE(review): the u'' prefix is a Python-2 leftover; harmless on Python 3.
if __name__ == u'__main__':
    run_from_command_line()

+ 12 - 0
examples/product/bin/manage.py

@@ -0,0 +1,12 @@
#!/usr/bin/env python
"""Django management entry point for this IEPY instance."""

import sys

from django.core.management import execute_from_command_line

import iepy
# Configure Django settings for this instance before dispatching commands.
iepy.setup(__file__)


if __name__ == "__main__":
    execute_from_command_line(sys.argv)

+ 96 - 0
examples/product/bin/preprocess.py

@@ -0,0 +1,96 @@
+"""
+Corpus preprocessing script
+
+Usage:
+    preprocess.py [options]
+    preprocess.py --split-in=<num-splits> --run-part=<num-part>
+    preprocess.py --increment-ner
+    preprocess.py -h | --help | --version
+
+Options:
+  -h --help                      Show this screen
+  --multiple-cores=<num-cores>   Number of cores (use all to use every processor)
+  --increment-ner                Re run NER and Gazetter for every document. If a document lacked any of the previous steps, will be preprocessed entirely.
+  --version                      Version number
+"""
+import logging
+
+from docopt import docopt
+
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+import iepy
+import multiprocessing
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.selfpreprocess.self_preprocess import SelfPreprocesser
+from iepy.selfpreprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.stanford_preprocess import StanfordPreprocess
+# from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.segmenter import SyntacticSegmenterRunner
+
+
+
+
class ParallelDocManager(DocumentManager):
    """DocumentManager that can partition its queryset among N workers."""

    def mines_of(self, qset, number_of_processors, my_id):
        # Keep only documents whose id falls in this worker's residue
        # class.  '%%%%' survives the Python %-formatting below as '%%',
        # which Django's .extra()/cursor layer then emits as a literal
        # '%', i.e. final SQL "id % K = N" -- TODO(review): confirm
        # against the driver's paramstyle escaping.
        K = number_of_processors
        N = my_id
        clause = 'id %%%% %s = %s' % (K, N)
        return qset.extra(where=[clause])
+
def start_preprocess(docs, increment_ner):
    """Run the preprocessing pipeline over *docs*.

    Only the SelfPreprocesser step is active; the syntactic segmenter
    step is kept disabled in the step list for reference.
    """
    pipeline = PreProcessPipeline([
        SelfPreprocesser(increment_ner),
        # SyntacticSegmenterRunner(increment=True)
    ], docs)
    pipeline.process_everything()
+
# Entry point: preprocess all documents lacking the brat step, either on
# one core, fanned out over several cores, or as one part of an external
# split (--split-in / --run-part).
if __name__ == '__main__':
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    opts = docopt(__doc__, version=iepy.__version__)
    increment_ner = opts['--increment-ner']

    dm = ParallelDocManager()
    all_docs = dm.get_documents_lacking_preprocess(
        [PreProcessSteps.brat])

    multiple_cores = opts.get('--multiple-cores')
    split_in = opts.get("--split-in")
    run_part = opts.get("--run-part")

    if multiple_cores:
        if multiple_cores == "all":
            multiple_cores = multiprocessing.cpu_count()
        try:
            multiple_cores = int(multiple_cores)
        except ValueError:
            logger.error("Invalid number of cores")
            exit(1)

        # One worker process per core; each handles its own residue class
        # of document ids.  NOTE(review): children are started but never
        # joined, so the parent returns without waiting for them.
        for i in range(multiple_cores):
            process = multiprocessing.Process(
                target=start_preprocess, args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner)
            )
            process.start()
    elif split_in:
        try:
            split_in = int(split_in)
            run_part = int(run_part) - 1  # CLI parts are 1-based
        except ValueError:
            logger.error("Invalid split")
            exit(1)

        # BUGFIX: valid 0-based parts are [0, split_in).  The previous
        # check (run_part > split_in) accepted run_part == split_in,
        # which silently selected an empty document set.
        if run_part < 0 or run_part >= split_in:
            logger.error("Parts must be between 1 and {}".format(split_in))
            exit(1)

        docs = dm.mines_of(all_docs, split_in, run_part)
        start_preprocess(docs, increment_ner)
    else:
        start_preprocess(all_docs, increment_ner)

+ 149 - 0
examples/product/bin/rules_verifier.py

@@ -0,0 +1,149 @@
+"""
+IEPY rules verifier
+
+
+Usage:
+    rules_verifier.py <relation> [options]
+
+Options:
+  --shuffle             Chooses the sample randomly and not the first ones
+  --create-evidences    Creates evidences that are missing [default: false]
+  -r --rule=<rule>      Tests only this rule
+  -l --limit=<limit>    Limits the amount of evidences uses
+  -h --help             Show this screen
+"""
+
+import sys
+import logging
+from docopt import docopt
+
+import refo
+from django.core.exceptions import ObjectDoesNotExist
+from colorama import init as colorama_init
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.data import models
+from iepy.data.models import EvidenceCandidate
+from iepy.data.db import CandidateEvidenceManager
+from iepy.extraction.terminal import TerminalEvidenceFormatter
+from iepy.extraction.rules import (
+    load_rules, compile_rule, generate_tokens_to_match
+)
+from iepy.metrics import result_dict_from_predictions
+
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
def run_from_command_line():
    """Verify the instance's rules against labeled candidate evidence.

    Parses CLI options, loads the requested relation, compiles the rules
    (optionally a single one via --rule), runs them over (a sample of)
    the candidate evidences, and prints matches plus metrics.
    """
    opts = docopt(__doc__, version=iepy.__version__)
    relation_name = opts.get("<relation>")
    limit = opts.get("--limit")
    rule_name = opts.get("--rule")
    shuffle = opts.get("--shuffle")
    create_evidences = opts.get("--create-evidences")

    # -1 means "no limit" downstream.
    if limit is None:
        limit = -1

    try:
        limit = int(limit)
    except ValueError:
        logging.error("Invalid limit value, it must be a number")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Load rules and pre-compile each into (name, regex, expected answer).
    rules = get_rules(rule_name)
    rule_regexes = [
        (rule.__name__, compile_rule(rule, relation), rule.answer) for rule in rules
    ]

    # Load evidences; force creation when the database holds none yet.
    if EvidenceCandidate.objects.all().count() == 0:
        create_evidences = True
    evidences = CandidateEvidenceManager.candidates_for_relation(
        relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle
    )
    conflict_solver = CandidateEvidenceManager.conflict_resolution_newest_wins
    answers = CandidateEvidenceManager.labels_for(
        relation, evidences, conflict_solver
    )
    run_tests(rule_regexes, evidences, answers)
+
+
def run_tests(rule_regexes, evidences, answers):
    """Run every compiled rule over the evidences and report results.

    For each rule, prints the evidences it matches; for evidences that
    carry a human label, accumulates (prediction, label) pairs across
    all rules and finally prints aggregate metrics.
    """
    predictions = []
    real_labels = []
    evidences_with_labels = []

    colorama_init()
    formatter = TerminalEvidenceFormatter()

    for name, regex, answer in rule_regexes:
        title = "Matches for rule '{}' (value: {})".format(name, answer)
        print("\n{}\n{}".format(title, "-" * len(title)))

        anything_matched = False
        for evidence in evidences:
            tokens_to_match = generate_tokens_to_match(evidence)
            match = refo.match(regex, tokens_to_match)

            if match:
                anything_matched = True
                print("  * {}".format(formatter.colored_text(evidence)))

            # Only labeled evidences contribute to the metrics.  Note that
            # each labeled evidence is counted once per rule.
            if evidence in answers and answers[evidence] is not None:
                evidences_with_labels.append(evidence)
                real_labels.append(answers[evidence])

                # A match predicts the rule's answer; no match predicts False.
                if match:
                    predictions.append(answer)
                else:
                    predictions.append(False)

        if not anything_matched:
            print("  nothing matched")

        print()

    if real_labels:
        results = result_dict_from_predictions(
            evidences_with_labels, real_labels, predictions
        )
        results.pop("end_time")
        keys = [
            "true_positives", "true_negatives",
            "false_positives", "false_negatives",
            "precision", "recall",
            "accuracy", "f1",
        ]

        title = "Metrics"
        print("{}\n{}".format(title, "-" * len(title)))
        for key in keys:
            print("{:>15}: {:.2f}".format(key, results[key]))
+
+
def get_rules(rule_name):
    """Load the instance's rules, optionally filtered to *rule_name*.

    Exits with status 1 when a rule with that name does not exist.
    """
    # Load rules
    rules = load_rules()

    if rule_name:
        rules = [x for x in rules if x.__name__ == rule_name]
        if not rules:
            logging.error("rule '{}' does not exists".format(rule_name))
            sys.exit(1)

    return rules
+
+
# Script entry point.
if __name__ == "__main__":
    run_from_command_line()

+ 241 - 0
examples/product/bin/settlement.py

@@ -0,0 +1,241 @@
+
+
+
+
+from django.db.models import Q
+import datetime,time
+import iepy
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.data.models import IEDocument,LabeledIEDocument,IEDocumentMetadata,LabeledIEDocumentMetadata,Payroll
+from brat.models import BratAnnotation,LabeledBratAnnotation
+from django.db import transaction
+import pandas as pd
+from django.contrib.auth.models import User
+
def object_to_dict(obj, class_model):
    """Return {field_name: field_value} for every local field of *obj*.

    :param obj: model instance to read values from
    :param class_model: Django model class describing the fields
    :return: dict mapping field names to the instance's values
    """
    meta = class_model._meta.concrete_model._meta
    return {
        field.name: field.value_from_object(obj)
        for field in meta.local_fields
    }
+
+
class Settlement():

    '''
    Settlement helper used by the annotation-project operator: computes
    per-user payrolls from annotation counts, exports Excel reports, and
    holds some (mostly disabled) data-migration utilities.
    '''

    def makePayroll(self,_user,time_begin,time_end):
        '''
        :param _user: username
        :param time_begin: period start date (yyyy-mm-dd)
        :param time_end: period end date (yyyy-mm-dd, inclusive)
        :return: creates or updates the user's Payroll row for the period
        '''
        # SECURITY(review): every statement below interpolates user/date
        # values straight into SQL via %-formatting; switch to
        # cursor.execute(sql, params) placeholders to avoid injection.
        from django.db import connection
        with transaction.atomic():
            cursor = connection.cursor()
            # Documents this user edited inside the period.
            sql = " select count(1) from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s'"%(_user,time_end,time_begin)
            cursor.execute(sql)
            doc_count = cursor.fetchall()[0][0]
            # Entity annotations ('T%') within the newest 1200 documents.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"T%")
            cursor.execute(sql)
            t_count = cursor.fetchall()[0][0]
            # Relation annotations ('R%') within the newest 1200 documents.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"R%")
            cursor.execute(sql)
            r_count = cursor.fetchall()[0][0]
            # Totals over the whole period (no 1200-document cap).
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s') and value like '%s' "%(_user,time_end,time_begin,"T%")
            cursor.execute(sql)
            all_t_count = cursor.fetchall()[0][0]
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s') and value like '%s' "%(_user,time_end,time_begin,"R%")
            cursor.execute(sql)
            all_r_count = cursor.fetchall()[0][0]
            # Piece rates: 0.03/0.05 per entity/relation inside the newest
            # 1200 documents, 0.04/0.06 for the remainder.
            wage = round(0.03*t_count+0.05*r_count+(all_t_count-t_count)*0.04+(all_r_count-r_count)*0.06,2)
            print(doc_count,t_count,r_count,wage)
            # Upsert the Payroll row keyed on (user, begin_time, end_time).
            payrolls = Payroll.objects.filter(Q(user=_user)& Q(begin_time=time_begin) & Q(end_time=time_end))
            if len(payrolls)==0:
                _payroll = Payroll.objects.create(**{"user":_user,"doc_count":doc_count,"begin_time":time_begin,"end_time":time_end,"t_count":all_t_count,"r_count":all_r_count,"wage":wage})
                _payroll.save()
            else:
                _payroll = payrolls[0]
                _payroll.doc_count = doc_count
                _payroll.t_count = all_t_count
                _payroll.r_count = all_r_count
                _payroll.wage = wage
                _payroll.save()

    def exportPayroll(self,begin_time,end_time):
        '''
        :param begin_time: export period start (None matches any start)
        :param end_time: export period end
        :return: writes an .xls report of the matching Payroll rows
        '''
        list_user = []
        list_doc_count = []
        list_t_count = []
        list_r_count = []
        list_wage = []
        list_yield = []
        list_account = []
        list_begin = []
        list_end = []
        if begin_time is not None:
            payrolls = Payroll.objects.filter(Q(begin_time=begin_time) & Q(end_time=end_time))
        else:
            payrolls = Payroll.objects.filter(Q(end_time=end_time))
        for _payroll in payrolls:
            list_user.append(_payroll.user)
            list_doc_count.append(_payroll.doc_count)
            list_t_count.append(_payroll.t_count)
            list_r_count.append(_payroll.r_count)
            list_wage.append(_payroll.wage)
            list_yield.append(_payroll._yield)
            list_account.append(_payroll.account)
            list_begin.append(_payroll.begin_time)
            list_end.append(_payroll.end_time)
        # Column names are intentionally Chinese: they become the Excel headers.
        df = pd.DataFrame({"用户":list_user,"开始时间":list_begin,"结束时间":list_end,"文章数":list_doc_count,"要素数":list_t_count,"关系数":list_r_count,"总价":list_wage,"合格率":list_yield,"结算价":list_account})
        df.to_excel("%s-%s要素标注统计.xls"%(begin_time,end_time),columns=["用户","开始时间","结束时间","文章数","要素数","关系数","总价","合格率","结算价"])

    def getAllUser(self):
        # Return the usernames of all staff users (the annotators to settle).
        from django.db import connection
        with transaction.atomic():
            list_user = []
            cursor = connection.cursor()
            sql = "select username from auth_user where is_staff='t'"
            cursor.execute(sql)
            for row in cursor.fetchall():
                list_user.append(row[0])
            return list_user


    def makeMigrate(self,_user,time_begin,time_end):
        '''
        :param _user: username
        :param time_begin: period start
        :param time_end: period end
        :return: migrate the user's labeled data in the period into the
                 "labeled" tables (currently disabled; old body kept below
                 for reference)
        '''
        pass
        # from django.db import connection
        # with transaction.atomic():
        #     cursor = connection.cursor()
        #     sql = " select human_identifier,offsets_to_text,sentences from corpus_iedocument where edituser is null"
        #     cursor.execute(sql)
        #     cursor1 = connection.cursor()
        #     _index = 0
        #     rows = True
        #     while(rows):
        #         rows=cursor.fetchmany(1000)
        #         for row in rows:
        #             _index += 1
        #             print(_index)
        #             human_identifier,offsets_to_text,sentences = row
        #             if sentences!="[]":
        #                 _off = offsets_to_text.split(", ")[-1][:-1]
        #                 _sen = sentences.split(", ")[-1][:-1]
        #                 print(_off,_sen)
        #                 if int(_off)!=int(_sen):
        #                     offsets_to_text = offsets_to_text[:-1]+", "+str(int(_sen))+"]"
        #                     print(offsets_to_text)
        #                     cursor1.execute("update corpus_iedocument set offsets_to_text='%s' where human_identifier='%s'"%(offsets_to_text,human_identifier))



            # ieDocuments = IEDocument.objects.filter(Q(edituser=_user) & Q(edittime__range=(time_begin,time_end)))
            # for obj in ieDocuments:
            #     _dict = object_to_dict(obj,IEDocument)
            #     _dict_meta = object_to_dict(obj.metadata,IEDocumentMetadata)
            #     labeledMeta = LabeledIEDocumentMetadata.objects.create(**_dict_meta)
            #     labeledMeta.save()
            #     _dict["metadata"] = labeledMeta
            #     tmp = LabeledIEDocument.objects.create(**_dict)
            #     tmp.save()
            #
            #     bratAnnotations = BratAnnotation.objects.filter(Q(document_id=obj.human_identifier))
            #     for ann in bratAnnotations:
            #         _dict_ann = object_to_dict(ann,BratAnnotation)
            #         labeledAnn = LabeledBratAnnotation.objects.create(**_dict_ann)
            #         labeledAnn.save()


    def getPercentOfPass(self,_user,time_begin,time_end):
        '''
        :param _user: username
        :param time_begin: period start
        :param time_end: period end
        :return: the user's annotation pass rate over the period.
                 NOTE(review): not implemented -- the docstring is the
                 entire body, so this always returns None.
        '''

    def makePayrolls(self,time_begin,time_end):
        '''
        :param time_begin: period start
        :param time_end: period end
        :return: builds every staff user's payroll, then exports the report
        '''
        for _user in self.getAllUser():
            self.makePayroll(_user,time_begin,time_end)
        self.exportPayroll(time_begin,time_end)

    def createUser_batch(self,batch_size=90):
        '''
        :param batch_size: number of users to create
        :return:
        '''
        # Creates users bidi1..bidiN with the password equal to the username.
        list_user = [User.objects.create_user(username="bidi%d"%(i+1),password="bidi%d"%(i+1)) for i in range(batch_size)]

    def exportLabels(self):
        # Export, per hard-coded user group, which documents each user
        # labeled and when -- one Excel file per group.
        groups = [[1,7],[8,14],[15,22],[23,29],[30,36],[37,43],[44,50],[51,56],[57,62],[63,71]]
        from django.db import connection
        cursor = connection.cursor()
        for _i in range(len(groups)):
            _begin,_end = groups[_i]
            list_username = []
            list_user = []
            list_label = []
            list_time = []
            for _j in range(_begin,_end+1):
                username = "bidi%d"%_j
                list_username.append("'%s'"%username)
            sql = " select edituser,human_identifier,to_char(edittime,'yyyy-mm-dd') from corpus_iedocument where edituser in(%s) order by edittime asc"%(",".join(list_username))
            print(sql)
            cursor.execute(sql)
            rows = cursor.fetchall()
            for row in rows:
                list_user.append(row[0])
                list_label.append(row[1])
                list_time.append(row[2])
            df = pd.DataFrame({"时间":list_time,"用户":list_user,"文章编号":list_label})
            df.to_excel("分组_%d.xls"%(_i+1),columns=["时间","用户","文章编号"])

    def filter(self):
        '''
        Flag auction notices: set jump_signal=1 on every document whose
        text contains "拍卖" (auction).
        :return:
        '''
        import re
        ieDocuments = IEDocument.objects.all()
        for obj in ieDocuments:
            if re.search("拍卖",obj.text) is not None:
                obj.jump_signal = 1
                obj.save()
                print(obj.human_identifier)
+
+
+
# Manual driver: uncomment the operation to run.
if __name__=="__main__":
    settle = Settlement()
    # settle.makeMigrate("test","2020-08-01","2020-08-31")
    # settle.makePayroll("test17","2020-08-01","2020-10-31")
    # settle.makePayrolls("2020-08-01","2020-08-31")
    settle.exportPayroll(begin_time=None,end_time='2020-10-31')
    # settle.createUser_batch(batch_size=102)
    # settle.exportLabels()
    # settle.filter()
+ 20 - 0
examples/product/extractor_config.json

@@ -0,0 +1,20 @@
+{
+    "sparse_features": [
+        "bag_of_words",
+        "bag_of_pos",
+        "bag_of_words_in_between",
+        "bag_of_pos_in_between"
+    ],
+    "dense_features": [
+        "entity_order",
+        "entity_distance",
+        "other_entities_in_between",
+        "verbs_count_in_between",
+        "verbs_count",
+        "total_number_of_entities",
+        "symbols_in_between",
+        "number_of_tokens"
+    ],
+    "classifier_args": {},
+    "classifier": "svc"
+}

+ 6 - 0
examples/product/format.py

@@ -0,0 +1,6 @@
"""Ad-hoc helper: print an epoch four days in the past and format sample
epoch timestamps as yyyy-mm-dd dates."""
import time

# Sample epoch timestamps to format.
a = [1462636800, 1606492800]

# Epoch seconds four days before "now".
print(time.time() - 86400 * 4)

for item in a:
    # BUGFIX: previously formatted the hard-coded epoch 1606377029 on
    # every iteration, ignoring the loop variable.
    print(time.strftime('%Y-%m-%d', time.localtime(item)))

Fichier diff supprimé car celui-ci est trop grand
+ 188 - 0
examples/product/product_article.csv


+ 2 - 0
examples/product/rules.py

@@ -0,0 +1,2 @@
+# Write here your rules
+# RELATION = 'your relation here'

+ 182 - 0
examples/product/settings.py

@@ -0,0 +1,182 @@
+"""
+For more information on this file, see
+https://docs.djangoproject.com/en/1.7/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.7/ref/settings/
+"""
+
+from iepy.webui.webui.settings import *
+from django.conf import settings
+
IEPY_VERSION = '0.9.6'
IEPY_LANG = 'en'
# SECURITY(review): the secret key and DB credentials below are committed
# in plain text; acceptable for a local example instance only -- rotate
# them before any real deployment.
SECRET_KEY = 'u==!fueit=wxo&j8!5u+sfasp4prjluk@*s=7!-wz_&r@pn))r'
DEBUG = True
TEMPLATE_DEBUG = True

# Database
# https://docs.djangoproject.com/en/1.7/ref/settings/#databases
# DATABASES = {
#     'default': {
#         'ENGINE': 'django.db.backends.sqlite3',
#         'NAME': '/home/python/luojiehua/dl_nlp/iepy-develop/examples/test/test.sqlite',
#     }
# }
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql_psycopg2',
        'NAME': 'iepy_product',
        'USER': 'postgres',
        'PASSWORD': 'postgres',
        'HOST': '192.168.2.101',
        'PORT': '5432'
    }
}
+
+# For changing tokenization options, read here.
+# http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html
+# You can use as key any of the "known options" listed on that page, and as value,
+# use True or False (python names) for booleans, or strings when option requires a text
+# CORENLP_TKN_OPTS = {
+#     'latexQuotes': False
+# }
+
+#default brat settings
+CONFIG_BRAT = {
+    "acl.conf":"""
+User-agent: *
+Allow: /
+Disallow: /hidden/
+
+User-agent: guest
+Disallow: /confidential/
+""",
+    "annotation.conf":"""
+[spans]
+product
+code
+name
+money
+	money_tendereeMoney
+	money_tendererMoney
+org
+	org_tenderee
+	org_agency
+	org_tenderer
+	org_secondTenderer
+	org_thirdTenderer
+company
+	company_tenderee
+	company_agency
+	company_tenderer
+	company_secondTenderer
+	company_thirdTenderer
+job
+person
+	person_tendereePerson
+	person_agencyPerson
+	person_person
+	person_review
+time
+	time_release
+	time_bidopen
+	time_bidclose
+location
+package
+phone
+moneysource
+bidway
+serviceTime
+[relations]
+Equiv	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, <REL-TYPE>:symmetric-transitive
+rel_tendererMoney	Arg1:org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderer|company_secondTenderer|company_thirdTenderer|org|company, Arg2:money_tendererMoney
+rel_tendereeMoney	Arg1:package, Arg2:money_tendereeMoney|money
+rel_person	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:person_tendereePerson|person_agencyPerson|person_person
+rel_pack	Arg1:org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:package
+rel_address	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:location
+rel_phone	Arg1:person_tendereePerson|person_agencyPerson|person_person, Arg2:phone
+rel_pack_code	Arg1:package, Arg2:code
+rel_pack_name	Arg1:package, Arg2:name
+
+
+[events]
+#Protein_binding|GO:0005515	Theme+:Protein
+#Gene_expression|GO:0010467	Theme:Protein
+
+[attributes]
+#att_role	Arg:<ENTITY>, Value:招标人|代理人|中标人|第二候选人|第三候选人|att_noRole
+#att_role	Arg:<ENTITY>, Value:att_tenderee|att_agency|att_tenderer|att_secondTenderer|att_thirdTenderer|att_noRole
+#att_money	Arg:<ENTITY>, Value:att_tendereeMoney|att_tendererMoney|att_nomoney
+#att_person	Arg:<ENTITY>, Value:att_noperson|att_tendereePerson|att_agencyPerson|att_person
+#Negation	Arg:<EVENT>
+#Speculation	Arg:<EVENT>
+""",
+    "visual.conf":"""
+[labels]
+code | 项目编号
+name | 项目名称
+org | 组织
+company | 公司
+job | 职业
+person | 人名
+time | 时间
+location | 地址
+package | 包号
+phone | 电话
+money | 金额
+money_tendereeMoney | 招标金额
+money_tendererMoney | 中投标金额
+
+org_tenderee | 招标人
+org_agency | 代理人
+org_tenderer | 中标人
+org_secondTenderer | 第二候选人
+org_thirdTenderer | 第三候选人
+company_tenderee | 招标人
+company_agency | 代理人
+company_tenderer | 中标人
+company_secondTenderer | 第二候选人
+company_thirdTenderer | 第三候选人
+
+person_tendereePerson | 招标联系人
+person_agencyPerson | 代理联系人
+person_person | 联系人
+
+rel_tendererMoney | 中投标金额
+rel_tendereeMoney | 招标金额
+rel_person | 联系人
+rel_pack | 所属包
+rel_address | 地址
+rel_phone | 联系电话
+rel_pack_code | 包件编号
+rel_pack_name | 包件名称
+
+person_review | 评审专家
+time_release | 发布时间
+time_bidopen | 开标时间
+time_bidclose | 截标时间
+moneysource | 资金来源
+bidway | 招标方式
+serviceTime | 服务期限
+product | 产品
+
+#Protein | Protein | Pro | P
+#Protein_binding | Protein binding | Binding | Bind
+#Gene_expression | Gene expression | Expression | Exp
+#Theme | Theme | Th
+
+[drawing]
+Protein	bgColor:#7fa2ff
+SPAN_DEFAULT	fgColor:black, bgColor:lightgreen, borderColor:black
+ARC_DEFAULT	color:black
+ATTRIBUTE_DEFAULT	glyph:*
+""",
+    "tools.conf":"""
+[search]
+google     <URL>:http://www.google.com/search?q=%s
+""",
+    "kb_shortcuts.conf":"""
+P	Protein
+"""
+}

BIN
examples/product/test.sqlite


+ 1 - 0
examples/test/__init__.py

@@ -0,0 +1 @@
+from . import rules

+ 49 - 0
examples/test/annotation.conf

@@ -0,0 +1,49 @@
+# -*- Mode: Text; tab-width: 8; indent-tabs-mode: nil; coding: utf-8; -*-
+# vim:set ft=conf ts=2 sw=2 sts=2 autoindent:
+
+# Simple text-based definitions of entity, relation and event types
+# and event attributes for the BioNLP Shared Task 2011 EPI task.
+
+
+[entities]
+
+Protein
+	abc
+Entity
+
+
+[relations]
+
+Equiv	Arg1:Protein, Arg2:Protein, <REL-TYPE>:symmetric-transitive
+Equiv	Arg1:Entity, Arg2:Entity, <REL-TYPE>:symmetric-transitive
+
+# (No entity nestings permitted in EPI. Could be defined using special
+# relation type ENTITY-NESTING if necessary.)
+
+
+[events]
+
+Catalysis	Theme:<EVENT>, Cause:Protein
+----------------------------------------
+DNA_methylation|GO:0006306	Theme:Protein, Site?:Entity
+DNA_demethylation|GO:0080111	Theme:Protein, Site?:Entity
+----------------------------------------
+Acetylation|GO:0006473	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Methylation|GO:0006479	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Glycosylation|GO:0006486	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Hydroxylation|GO:0018126	Theme:Protein, Site?:Entity
+Phosphorylation|GO:0006468	Theme:Protein, Site?:Entity
+Ubiquitination|GO:0016567	Theme:Protein, Site?:Entity
+----------------------------------------
+Deacetylation|GO:0006476	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Demethylation|GO:0006482	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Deglycosylation|GO:0006517	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Dehydroxylation|GO:-------	Theme:Protein, Site?:Entity
+Dephosphorylation|GO:0006470	Theme:Protein, Site?:Entity
+Deubiquitination|GO:0016579	Theme:Protein, Site?:Entity
+
+
+[attributes]
+
+Negation	Arg:<EVENT>
+Speculation	Arg:<EVENT>

Fichier diff supprimé car celui-ci est trop grand
+ 7 - 0
examples/test/articles.csv


BIN
examples/test/bin/2020-08-01-2020-08-31要素标注统计.xls


BIN
examples/test/bin/None-2020-09-25要素标注统计.xls


BIN
examples/test/bin/None-2020-10-31要素标注统计.xls


BIN
examples/test/bin/None-2020-11-25要素标注统计.xls


BIN
examples/test/bin/None-2020-12-25要素标注统计.xls


+ 28 - 0
examples/test/bin/csv_to_iepy.py

@@ -0,0 +1,28 @@
+"""
+IEPY database loader from csv file
+
+Usage:
+    csv_to_iepy.py <filename>
+    csv_to_iepy.py -h | --help
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+corpus in two columns: 'freebase_mid' and 'description'.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+
+import logging
+
+from docopt import docopt
+
+import iepy
+# Must run before importing iepy.utils: wires this instance's settings.
+iepy.setup(__file__)
+from iepy.utils import csv_to_iepy
+
+# NOTE(review): the docstring above is the docopt CLI spec. It lists
+# --version under Options but no Usage pattern accepts it, so passing
+# --version will produce a usage error. Also the 'freebase_mid'/'description'
+# column names look copied from the birthdate example — confirm they match
+# this instance's corpus.
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    opts = docopt(__doc__, version=iepy.__version__)
+    filepath = opts["<filename>"]
+    # Parses the csv and loads every row as an IEDocument into the database.
+    csv_to_iepy(filepath)

+ 76 - 0
examples/test/bin/gazettes_loader.py

@@ -0,0 +1,76 @@
+"""
+IEPY gazettes loader
+
+Usage:
+    gazettes_loader.py <filename>
+
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+gazettes in two columns: 'literal' and 'class'.
+
+
+Options:
+  -h --help             Show this screen
+"""
+
+import sys
+import csv
+import gzip
+import logging
+from operator import itemgetter
+
+from django.db import IntegrityError
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.data.models import EntityKind, GazetteItem
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def add_gazettes_from_csv(filepath):
+    if filepath.endswith(".gz"):
+        fin = gzip.open(filepath, "rt")
+    else:
+        fin = open(filepath, "rt")
+    reader = csv.DictReader(fin)
+
+    expected_fnames = ['literal', 'class']
+    if not set(reader.fieldnames).issuperset(expected_fnames):
+        msg = "Couldn't find the expected field names on the provided csv: {}"
+        sys.exit(msg.format(expected_fnames))
+
+    _create_gazette_entries(
+        itemgetter(*expected_fnames)(line) for line in reader
+    )
+
+
+def _create_gazette_entries(entries_list):
+    kind_cache = {}
+    created = 0
+    for literal, kind_name in entries_list:
+        literal = literal.strip()
+        kind_name = kind_name.strip()
+        kind = kind_cache.get(kind_name)
+        if kind is None:
+            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
+            kind_cache[kind_name] = kind
+        gazette = GazetteItem(text=literal, kind=kind)
+
+        try:
+            gazette.save()
+        except IntegrityError as error:
+            logging.warn(
+                "Gazette '{}' of class '{}' not loaded, literal already existed".format(
+                literal, kind_name))
+            print(error)
+        finally:
+            created += 1
+    print('Created {} new gazette items'.format(created))
+
+
+# NOTE(review): the module docstring's Usage section has no --version
+# pattern, so docopt's version= argument is effectively unused here.
+if __name__ == "__main__":
+    opts = docopt(__doc__, version=iepy.__version__)
+    fname = opts["<filename>"]
+    add_gazettes_from_csv(fname)

+ 59 - 0
examples/test/bin/iepy_rules_runner.py

@@ -0,0 +1,59 @@
+"""
+Run IEPY rule-based extractor
+
+Usage:
+    iepy_rules_runner.py
+    iepy_rules_runner.py -h | --help | --version
+
+Picks from rules.py the relation to work with, and the rules definitions and
+proceeds with the extraction.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+import sys
+import logging
+
+from django.core.exceptions import ObjectDoesNotExist
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.extraction.rules import load_rules
+from iepy.extraction.rules_core import RuleBasedCore
+from iepy.data import models, output
+from iepy.data.db import CandidateEvidenceManager
+
+
+def run_from_command_line():
+    """Run the rule-based extractor for the relation named in rules.RELATION.
+
+    Exits with status 1 when RELATION is missing from the rules module or
+    when no Relation with that name exists in the database.
+    """
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+    try:
+        relation_name = iepy.instance.rules.RELATION
+    except AttributeError:
+        logging.error("RELATION not defined in rules file")
+        sys.exit(1)
+
+    try:
+        relation = models.Relation.objects.get(name=relation_name)
+    except ObjectDoesNotExist:
+        logging.error("Relation {!r} not found".format(relation_name))
+        sys.exit(1)
+
+    # Load rules from the instance's rules module.
+    rules = load_rules()
+
+    # Load candidate evidences for the relation.
+    evidences = CandidateEvidenceManager.candidates_for_relation(relation)
+
+    # Run the pipeline: start, process, then predict over all evidences
+    # and hand the predictions to the interactive output dumper.
+    iextractor = RuleBasedCore(relation, rules)
+    iextractor.start()
+    iextractor.process()
+    predictions = iextractor.predict(evidences)
+    output.dump_output_loop(predictions)
+
+
+if __name__ == u'__main__':
+    run_from_command_line()

+ 184 - 0
examples/test/bin/iepy_runner.py

@@ -0,0 +1,184 @@
+"""
+Run IEPY active-learning extractor
+
+Usage:
+    iepy_runner.py [options] <relation_name> <output>
+    iepy_runner.py [options] --db-store <relation_name>
+    iepy_runner.py -h | --help | --version
+
+Options:
+  --store-extractor=<extractor_output>     Stores the trained classifier
+  --trained-extractor=<extractor_path>     Load an already trained extractor
+  --db-store                               Stores the predictions on the database
+  --no-questions                           Won't generate questions to answer. Will predict
+                                           as is. Should be used with --trained-extractor
+  --tune-for=<tune-for>                    Predictions tuning. Options are high-prec
+                                           or high-recall [default: high-prec]
+  --extractor-config=<config.json>         Sets the extractor config
+  --version                                Version number
+  -h --help                                Show this screen
+"""
+
+import os
+import json
+import logging
+from docopt import docopt
+from sys import exit
+
+import iepy
+INSTANCE_PATH = iepy.setup(__file__)
+
+from iepy.extraction.active_learning_core import ActiveLearningCore, HIPREC, HIREC
+from iepy.data.db import CandidateEvidenceManager
+from iepy.data.models import Relation
+from iepy.extraction.terminal import TerminalAdministration
+from iepy.data import output
+
+
+def print_all_relations():
+    """Print every Relation in the database (used from error paths)."""
+    print("All available relations:")
+    for relation in Relation.objects.all():
+        print("  {}".format(relation))
+
+
+def load_labeled_evidences(relation, evidences):
+    """Return labels for *evidences*; on conflicting labels the newest wins."""
+    CEM = CandidateEvidenceManager  # shortcut
+    return CEM.labels_for(relation, evidences, CEM.conflict_resolution_newest_wins)
+
+
+def _get_tuning_mode(opts):
+    """Map the --tune-for option to HIPREC/HIREC; exit(1) on any other value."""
+    if opts['--tune-for'] == 'high-prec':
+        tuning_mode = HIPREC
+    elif opts['--tune-for'] == 'high-recall':
+        tuning_mode = HIREC
+    else:
+        print ('Invalid tuning mode')
+        print (__doc__)
+        exit(1)
+    return tuning_mode
+
+
+def _get_relation(opts):
+    """Fetch the Relation named by <relation_name>; list all and exit(1) if absent."""
+    relation_name = opts['<relation_name>']
+    try:
+        relation = Relation.objects.get(name=relation_name)
+    except Relation.DoesNotExist:
+        print("Relation {!r} non existent".format(relation_name))
+        print_all_relations()
+        exit(1)
+    return relation
+
+
+def _load_extractor(opts, relation, labeled_evidences):
+    """Load a previously-trained extractor from --trained-extractor.
+
+    Exits with status 1 when the file is not a valid extractor dump or when
+    the dump was trained for a different relation than the requested one.
+    """
+    extractor_path = opts.get('--trained-extractor')
+    try:
+        iextractor = ActiveLearningCore.load(extractor_path,
+                                             labeled_evidences=labeled_evidences)
+    except ValueError:
+        print("Error: unable to load extractor, invalid file")
+        exit(1)
+
+    if iextractor.relation != relation:
+        print('The loaded extractor is not for the requested relation'
+              ' but for relation {} instead'.format(iextractor.relation))
+        exit(1)
+    print('Extractor successfully loaded')
+    return iextractor
+
+
+def _construct_extractor(opts, relation, labeled_evidences, tuning_mode):
+    """Build a fresh ActiveLearningCore from a JSON extractor config.
+
+    The config comes from --extractor-config or, when omitted, from
+    extractor_config.json in the instance directory. Exits with status 1
+    when the file is missing or is not valid JSON.
+    """
+    config_filepath = opts.get("--extractor-config")
+    if not config_filepath:
+        config_filepath = os.path.join(INSTANCE_PATH, "extractor_config.json")
+
+    if not os.path.exists(config_filepath):
+        print("Error: extractor config does not exists, please create the "
+              "file extractor_config.json or use the --extractor-config")
+        exit(1)
+
+    with open(config_filepath) as filehandler:
+        try:
+            extractor_config = json.load(filehandler)
+        except Exception as error:
+            print("Error: unable to load extractor config: {}".format(error))
+            exit(1)
+
+    iextractor = ActiveLearningCore(
+        relation, labeled_evidences, extractor_config, tradeoff=tuning_mode
+    )
+    return iextractor
+
+
+def run_from_command_line():
+    """Entry point: train/load an extractor, optionally ask questions, predict.
+
+    Flow: parse options, resolve the relation, build labeled evidences,
+    then either load a trained extractor (which implies --no-questions) or
+    construct a new one; run the interactive labeling loop unless disabled,
+    predict over every candidate, and store results (db/csv/extractor dump)
+    according to the options. Exits with status 1 when nothing is predicted.
+    """
+    opts = docopt(__doc__, version=iepy.__version__)
+
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    logging.getLogger("featureforge").setLevel(logging.WARN)
+
+    tuning_mode = _get_tuning_mode(opts)
+    relation = _get_relation(opts)
+
+    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
+    labeled_evidences = load_labeled_evidences(relation, candidates)
+
+    if opts.get('--trained-extractor'):
+        iextractor = _load_extractor(opts, relation, labeled_evidences)
+        was_ever_trained = True
+        # A pre-trained extractor skips the interactive question loop.
+        opts["--no-questions"] = True
+    else:
+        iextractor = _construct_extractor(opts, relation, labeled_evidences, tuning_mode)
+        iextractor.start()
+        was_ever_trained = False
+
+    if not opts.get("--no-questions", False):
+        questions_loop(iextractor, relation, was_ever_trained)
+
+    # Candidates generator was consumed when generating labeled_evidences, so we'll
+    # define it fresh again
+    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
+    # Predict and store output
+    predictions = iextractor.predict(candidates)  # asking predictions for EVERYTHING
+    if not predictions:
+        print("Nothing was predicted")
+        exit(1)
+
+    if opts.get("--db-store"):
+        output.dump_predictions_to_database(relation, predictions)
+
+    output_file = opts.get("<output>")
+    if output_file:
+        output.dump_runner_output_to_csv(predictions, output_file)
+
+    classifier_output = opts.get("--store-extractor")
+    if classifier_output:
+        iextractor.save(classifier_output)
+
+
+def questions_loop(iextractor, relation, was_ever_trained):
+    """Interactive loop: present questions, feed answers back, re-process.
+
+    Repeats while the extractor still has questions, stopping early when the
+    user picks the STOP option. NOTE(review): the reassignment of
+    was_ever_trained inside the loop only affects the local name — callers
+    never see the update; here it only guards the final process() call.
+    """
+    STOP = u'STOP'
+    term = TerminalAdministration(
+        relation,
+        extra_options=[(STOP, u'Stop execution')]
+    )
+    while iextractor.questions:
+        questions = list(iextractor.questions)  # copying the list
+        term.update_candidate_evidences_to_label(questions)
+        result = term()
+        i = 0
+        # Feed every freshly-labeled answer back into the extractor.
+        for c, label_value in load_labeled_evidences(relation, questions).items():
+            if label_value is not None:
+                iextractor.add_answer(c, label_value)
+                i += 1
+        print ('Added %s new human labels to the extractor core' % i)
+        iextractor.process()
+        was_ever_trained = True
+        if result == STOP:
+            break
+
+    if not was_ever_trained:
+        # It's needed to run some process before asking for predictions
+        iextractor.process()
+
+
+if __name__ == u'__main__':
+    run_from_command_line()

+ 12 - 0
examples/test/bin/manage.py

@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# Django manage.py entry point for this IEPY instance; iepy.setup() wires
+# the instance settings before any management command runs.
+
+import sys
+
+from django.core.management import execute_from_command_line
+
+import iepy
+iepy.setup(__file__)
+
+
+if __name__ == "__main__":
+    execute_from_command_line(sys.argv)

+ 96 - 0
examples/test/bin/preprocess.py

@@ -0,0 +1,96 @@
+"""
+Corpus preprocessing script
+
+Usage:
+    preprocess.py [options]
+    preprocess.py --split-in=<num-splits> --run-part=<num-part>
+    preprocess.py --increment-ner
+    preprocess.py -h | --help | --version
+
+Options:
+  -h --help                      Show this screen
+  --multiple-cores=<num-cores>   Number of cores (use all to use every processor)
+  --increment-ner                Re run NER and Gazetter for every document. If a document lacked any of the previous steps, will be preprocessed entirely.
+  --version                      Version number
+"""
+import logging
+
+from docopt import docopt
+
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+import iepy
+import multiprocessing
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.selfpreprocess.self_preprocess import SelfPreprocesser
+from iepy.selfpreprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.stanford_preprocess import StanfordPreprocess
+# from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.segmenter import SyntacticSegmenterRunner
+
+
+
+
+class ParallelDocManager(DocumentManager):
+    """DocumentManager that can partition a queryset by id modulo."""
+
+    def mines_of(self, qset, number_of_processors, my_id):
+        """Return the slice of *qset* whose id % number_of_processors == my_id."""
+        K = number_of_processors
+        N = my_id
+        # '%%%%' collapses to '%%' after this %-formatting; presumably the
+        # db layer then unescapes '%%' to a literal '%' in the SQL — confirm
+        # against the backend in use.
+        clause = 'id %%%% %s = %s' % (K, N)
+        return qset.extra(where=[clause])
+
+def start_preprocess(docs, increment_ner):
+    """Run the preprocessing pipeline (SelfPreprocesser) over *docs*."""
+    pipeline = PreProcessPipeline([
+        SelfPreprocesser(increment_ner),
+        # SyntacticSegmenterRunner(increment=True)
+    ], docs)
+    pipeline.process_everything()
+
+if __name__ == '__main__':
+    logger = logging.getLogger(u'preprocess')
+    logger.setLevel(logging.INFO)
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    opts = docopt(__doc__, version=iepy.__version__)
+    increment_ner = opts['--increment-ner']
+
+    dm = ParallelDocManager()
+    all_docs = dm.get_documents_lacking_preprocess(
+        [PreProcessSteps.brat])
+
+    multiple_cores = opts.get('--multiple-cores')
+    split_in = opts.get("--split-in")
+    run_part = opts.get("--run-part")
+
+    if multiple_cores:
+        if multiple_cores == "all":
+            multiple_cores = multiprocessing.cpu_count()
+        try:
+            multiple_cores = int(multiple_cores)
+        except ValueError:
+            logger.error("Invalid number of cores")
+            exit(1)
+
+        for i in range(multiple_cores):
+            process = multiprocessing.Process(
+                target=start_preprocess, args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner)
+            )
+            process.start()
+    elif split_in:
+        try:
+            split_in = int(split_in)
+            run_part = int(run_part) - 1
+        except ValueError:
+            logger.error("Invalid split")
+            exit(1)
+
+        if run_part < 0 or run_part > split_in:
+            logger.error("Parts must be between 1 and {}".format(split_in))
+            exit(1)
+
+        docs = dm.mines_of(all_docs, split_in, run_part)
+        start_preprocess(docs, increment_ner)
+    else:
+        start_preprocess(all_docs, increment_ner)

+ 149 - 0
examples/test/bin/rules_verifier.py

@@ -0,0 +1,149 @@
+"""
+IEPY rules verifier
+
+
+Usage:
+    rules_verifier.py <relation> [options]
+
+Options:
+  --shuffle             Chooses the sample randomly and not the first ones
+  --create-evidences    Creates evidences that are missing [default: false]
+  -r --rule=<rule>      Tests only this rule
+  -l --limit=<limit>    Limits the amount of evidences uses
+  -h --help             Show this screen
+"""
+
+import sys
+import logging
+from docopt import docopt
+
+import refo
+from django.core.exceptions import ObjectDoesNotExist
+from colorama import init as colorama_init
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.data import models
+from iepy.data.models import EvidenceCandidate
+from iepy.data.db import CandidateEvidenceManager
+from iepy.extraction.terminal import TerminalEvidenceFormatter
+from iepy.extraction.rules import (
+    load_rules, compile_rule, generate_tokens_to_match
+)
+from iepy.metrics import result_dict_from_predictions
+
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def run_from_command_line():
+    """Verify the instance's rules against labeled evidences for a relation.
+
+    Parses options, compiles every selected rule to a regex, gathers
+    candidate evidences (creating them if none exist), resolves labels with
+    newest-wins, and delegates to run_tests.
+    """
+    opts = docopt(__doc__, version=iepy.__version__)
+    relation_name = opts.get("<relation>")
+    limit = opts.get("--limit")
+    rule_name = opts.get("--rule")
+    shuffle = opts.get("--shuffle")
+    create_evidences = opts.get("--create-evidences")
+
+    # -1 presumably means "no segment limit" downstream — confirm against
+    # candidates_for_relation.
+    if limit is None:
+        limit = -1
+
+    try:
+        limit = int(limit)
+    except ValueError:
+        logging.error("Invalid limit value, it must be a number")
+        sys.exit(1)
+
+    try:
+        relation = models.Relation.objects.get(name=relation_name)
+    except ObjectDoesNotExist:
+        logging.error("Relation {!r} not found".format(relation_name))
+        sys.exit(1)
+
+    # Load rules and pre-compile each into (name, regex, answer).
+    rules = get_rules(rule_name)
+    rule_regexes = [
+        (rule.__name__, compile_rule(rule, relation), rule.answer) for rule in rules
+    ]
+
+    # Load evidences; force creation when none exist yet.
+    if EvidenceCandidate.objects.all().count() == 0:
+        create_evidences = True
+    evidences = CandidateEvidenceManager.candidates_for_relation(
+        relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle
+    )
+    conflict_solver = CandidateEvidenceManager.conflict_resolution_newest_wins
+    answers = CandidateEvidenceManager.labels_for(
+        relation, evidences, conflict_solver
+    )
+    run_tests(rule_regexes, evidences, answers)
+
+
+def run_tests(rule_regexes, evidences, answers):
+    """Match every rule regex against every evidence and report metrics.
+
+    Prints the evidences matched by each rule; for evidences that have a
+    human label, records the rule's answer on match (False otherwise) and
+    finally prints precision/recall/accuracy/f1 when any labels existed.
+    """
+    predictions = []
+    real_labels = []
+    evidences_with_labels = []
+
+    colorama_init()
+    formatter = TerminalEvidenceFormatter()
+
+    for name, regex, answer in rule_regexes:
+        title = "Matches for rule '{}' (value: {})".format(name, answer)
+        print("\n{}\n{}".format(title, "-" * len(title)))
+
+        anything_matched = False
+        for evidence in evidences:
+            tokens_to_match = generate_tokens_to_match(evidence)
+            match = refo.match(regex, tokens_to_match)
+
+            if match:
+                anything_matched = True
+                print("  * {}".format(formatter.colored_text(evidence)))
+
+            # Only labeled evidences contribute to the metrics.
+            if evidence in answers and answers[evidence] is not None:
+                evidences_with_labels.append(evidence)
+                real_labels.append(answers[evidence])
+
+                if match:
+                    predictions.append(answer)
+                else:
+                    predictions.append(False)
+
+        if not anything_matched:
+            print("  nothing matched")
+
+        print()
+
+    if real_labels:
+        results = result_dict_from_predictions(
+            evidences_with_labels, real_labels, predictions
+        )
+        # end_time is bookkeeping, not a metric worth printing.
+        results.pop("end_time")
+        keys = [
+            "true_positives", "true_negatives",
+            "false_positives", "false_negatives",
+            "precision", "recall",
+            "accuracy", "f1",
+        ]
+
+        title = "Metrics"
+        print("{}\n{}".format(title, "-" * len(title)))
+        for key in keys:
+            print("{:>15}: {:.2f}".format(key, results[key]))
+
+
+def get_rules(rule_name):
+    # Load rules
+    rules = load_rules()
+
+    if rule_name:
+        rules = [x for x in rules if x.__name__ == rule_name]
+        if not rules:
+            logging.error("rule '{}' does not exists".format(rule_name))
+            sys.exit(1)
+
+    return rules
+
+
+if __name__ == "__main__":
+    run_from_command_line()

+ 251 - 0
examples/test/bin/settlement.py

@@ -0,0 +1,251 @@
+
+
+
+
+from django.db.models import Q
+import datetime,time
+import iepy
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.data.models import IEDocument,LabeledIEDocument,IEDocumentMetadata,LabeledIEDocumentMetadata,Payroll
+from brat.models import BratAnnotation,LabeledBratAnnotation
+from django.db import transaction
+import pandas as pd
+from django.contrib.auth.models import User
+
+def object_to_dict(obj,class_model):
+    '''
+    Convert a model instance into a {field_name: value} dict.
+
+    :param obj: the model instance
+    :param class_model: the django model class of obj
+    :return: dict built from the instance's concrete local fields
+    '''
+    _dict = {}
+    concrete_model = class_model._meta.concrete_model
+    for field in concrete_model._meta.local_fields:
+        value = field.value_from_object(obj)
+        _dict[field.name] = value
+    return _dict
+
+
+class Settlement():
+
+    '''
+    @summary: 结算类,定义了结算者所需要执行的各种方法
+    '''
+
+    def makePayroll(self,list_user,begin_time,time_end):
+        '''
+        Create or refresh one Payroll row per user for the period.
+
+        :param list_user: list of usernames to settle
+        :param begin_time: NOTE(review): unused — the effective start is the
+            user's last settled end_time, or "2020-08-01" if none exists
+        :param time_end: end of the period ('yyyy-mm-dd', inclusive)
+        :return: None; creates/updates Payroll rows per user
+        '''
+        # NOTE(review): every query below is built with %-string formatting.
+        # Safe only while usernames/dates come from trusted code — switch to
+        # parameterized queries if these values can ever be user-supplied.
+        from django.db import connection
+        with transaction.atomic():
+            cursor = connection.cursor()
+            time_begin = "2020-08-01"
+            for _user in list(set(list_user)):
+                # Resume from the user's last settled period, if any.
+                sql = 'select max(end_time) from corpus_payroll where "user"=\'%s\''%(_user)
+                cursor.execute(sql)
+                rows = cursor.fetchall()
+                if rows[0][0] is not None:
+                    time_begin = rows[0][0]
+                else:
+                    time_begin = "2020-08-01"
+
+                # Documents edited by the user inside the period.
+                sql = " select count(1) from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s'"%(_user,time_end,time_begin)
+                cursor.execute(sql)
+                doc_count = cursor.fetchall()[0][0]
+                # T (entity) / R (relation) annotations within the latest
+                # 1200 documents vs. over all documents; the wage formula
+                # below pays the two ranges at different rates.
+                sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"T%")
+                cursor.execute(sql)
+                t_count = cursor.fetchall()[0][0]
+                sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"R%")
+                cursor.execute(sql)
+                r_count = cursor.fetchall()[0][0]
+                sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s') and value like '%s' "%(_user,time_end,time_begin,"T%")
+                cursor.execute(sql)
+                all_t_count = cursor.fetchall()[0][0]
+                sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s') and value like '%s' "%(_user,time_end,time_begin,"R%")
+                cursor.execute(sql)
+                all_r_count = cursor.fetchall()[0][0]
+                wage = round(0.03*t_count+0.05*r_count+(all_t_count-t_count)*0.04+(all_r_count-r_count)*0.06,2)
+                print(doc_count,t_count,r_count,wage)
+                # Upsert the Payroll row for (user, begin, end).
+                payrolls = Payroll.objects.filter(Q(user=_user)& Q(begin_time=time_begin) & Q(end_time=time_end))
+                if len(payrolls)==0:
+                    _payroll = Payroll.objects.create(**{"user":_user,"doc_count":doc_count,"begin_time":time_begin,"end_time":time_end,"t_count":all_t_count,"r_count":all_r_count,"wage":wage})
+                    _payroll.save()
+                else:
+                    _payroll = payrolls[0]
+                    _payroll.doc_count = doc_count
+                    _payroll.t_count = all_t_count
+                    _payroll.r_count = all_r_count
+                    _payroll.wage = wage
+                    _payroll.save()
+
+    def exportPayroll(self,begin_time,end_time):
+        '''
+        Export matching Payroll rows to an .xls file (Chinese column names).
+
+        :param begin_time: period start filter; when None, filter on
+            end_time only
+        :param end_time: period end filter
+        :return: None; writes "<begin>-<end>要素标注统计.xls" in the cwd
+        '''
+        list_user = []
+        list_doc_count = []
+        list_t_count = []
+        list_r_count = []
+        list_wage = []
+        list_yield = []
+        list_account = []
+        list_begin = []
+        list_end = []
+        if begin_time is not None:
+            payrolls = Payroll.objects.filter(Q(begin_time=begin_time) & Q(end_time=end_time))
+        else:
+            payrolls = Payroll.objects.filter(Q(end_time=end_time))
+        # NOTE(review): assumes Payroll has _yield and account fields —
+        # confirm against the model definition.
+        for _payroll in payrolls:
+            list_user.append(_payroll.user)
+            list_doc_count.append(_payroll.doc_count)
+            list_t_count.append(_payroll.t_count)
+            list_r_count.append(_payroll.r_count)
+            list_wage.append(_payroll.wage)
+            list_yield.append(_payroll._yield)
+            list_account.append(_payroll.account)
+            list_begin.append(_payroll.begin_time)
+            list_end.append(_payroll.end_time)
+        df = pd.DataFrame({"用户":list_user,"开始时间":list_begin,"结束时间":list_end,"文章数":list_doc_count,"要素数":list_t_count,"关系数":list_r_count,"总价":list_wage,"合格率":list_yield,"结算价":list_account})
+        df.to_excel("%s-%s要素标注统计.xls"%(begin_time,end_time),columns=["用户","开始时间","结束时间","文章数","要素数","关系数","总价","合格率","结算价"])
+
+    def getAllUser(self):
+        """Return the usernames of every staff user (auth_user.is_staff)."""
+        from django.db import connection
+        with transaction.atomic():
+            list_user = []
+            cursor = connection.cursor()
+            sql = "select username from auth_user where is_staff='t'"
+            cursor.execute(sql)
+            for row in cursor.fetchall():
+                list_user.append(row[0])
+            return list_user
+
+
+    def makeMigrate(self,_user,time_begin,time_end):
+        '''
+        Migrate a user's data for the period into the "labeled" tables.
+
+        :param _user: username
+        :param time_begin: start of the period
+        :param time_end: end of the period
+        :return: None — NOT IMPLEMENTED; the body below is a commented-out
+            prototype kept for reference.
+        '''
+        pass
+        # from django.db import connection
+        # with transaction.atomic():
+        #     cursor = connection.cursor()
+        #     sql = " select human_identifier,offsets_to_text,sentences from corpus_iedocument where edituser is null"
+        #     cursor.execute(sql)
+        #     cursor1 = connection.cursor()
+        #     _index = 0
+        #     rows = True
+        #     while(rows):
+        #         rows=cursor.fetchmany(1000)
+        #         for row in rows:
+        #             _index += 1
+        #             print(_index)
+        #             human_identifier,offsets_to_text,sentences = row
+        #             if sentences!="[]":
+        #                 _off = offsets_to_text.split(", ")[-1][:-1]
+        #                 _sen = sentences.split(", ")[-1][:-1]
+        #                 print(_off,_sen)
+        #                 if int(_off)!=int(_sen):
+        #                     offsets_to_text = offsets_to_text[:-1]+", "+str(int(_sen))+"]"
+        #                     print(offsets_to_text)
+        #                     cursor1.execute("update corpus_iedocument set offsets_to_text='%s' where human_identifier='%s'"%(offsets_to_text,human_identifier))
+
+
+
+            # ieDocuments = IEDocument.objects.filter(Q(edituser=_user) & Q(edittime__range=(time_begin,time_end)))
+            # for obj in ieDocuments:
+            #     _dict = object_to_dict(obj,IEDocument)
+            #     _dict_meta = object_to_dict(obj.metadata,IEDocumentMetadata)
+            #     labeledMeta = LabeledIEDocumentMetadata.objects.create(**_dict_meta)
+            #     labeledMeta.save()
+            #     _dict["metadata"] = labeledMeta
+            #     tmp = LabeledIEDocument.objects.create(**_dict)
+            #     tmp.save()
+            #
+            #     bratAnnotations = BratAnnotation.objects.filter(Q(document_id=obj.human_identifier))
+            #     for ann in bratAnnotations:
+            #         _dict_ann = object_to_dict(ann,BratAnnotation)
+            #         labeledAnn = LabeledBratAnnotation.objects.create(**_dict_ann)
+            #         labeledAnn.save()
+
+
+    def getPercentOfPass(self,_user,time_begin,time_end):
+        '''
+        Intended to compute the pass rate of a user's annotations in the
+        period. NOTE(review): not implemented — the docstring is the whole
+        body, so the method always returns None.
+
+        :param _user: username
+        :param time_begin: start of the period
+        :param time_end: end of the period
+        :return: None (unimplemented)
+        '''
+
+    def makePayrolls(self,time_begin,time_end):
+        '''
+        :param time_begin:起始时间
+        :param time_end: 截至时间
+        :return: 获得所有用户的工资表
+        '''
+        for _user in self.getAllUser():
+            self.makePayroll(_user,time_begin,time_end)
+        self.exportPayroll(time_begin,time_end)
+
+    def createUser_batch(self,batch_size=90):
+        '''
+        Create batch_size accounts named "bidi1".."bidiN" with the username
+        as password.
+
+        :param batch_size: number of users to create
+        :return: None
+        '''
+        list_user = [User.objects.create_user(username="bidi%d"%(i+1),password="bidi%d"%(i+1)) for i in range(batch_size)]
+
+    def exportLabels(self):
+        """Export per-group labeling activity to 分组_<n>.xls files.
+
+        Each hard-coded [start, end] range selects usernames bidi<start> to
+        bidi<end>; their edited documents (user, id, date) are dumped to one
+        excel file per group.
+        """
+        groups = [[1,7],[8,14],[15,22],[23,29],[30,36],[37,43],[44,50],[51,56],[57,62],[63,71]]
+        from django.db import connection
+        cursor = connection.cursor()
+        for _i in range(len(groups)):
+            _begin,_end = groups[_i]
+            list_username = []
+            list_user = []
+            list_label = []
+            list_time = []
+            for _j in range(_begin,_end+1):
+                username = "bidi%d"%_j
+                list_username.append("'%s'"%username)
+            sql = " select edituser,human_identifier,to_char(edittime,'yyyy-mm-dd') from corpus_iedocument where edituser in(%s) order by edittime asc"%(",".join(list_username))
+            print(sql)
+            cursor.execute(sql)
+            rows = cursor.fetchall()
+            for row in rows:
+                list_user.append(row[0])
+                list_label.append(row[1])
+                list_time.append(row[2])
+            df = pd.DataFrame({"时间":list_time,"用户":list_user,"文章编号":list_label})
+            df.to_excel("分组_%d.xls"%(_i+1),columns=["时间","用户","文章编号"])
+
+    def filter(self):
+        '''
+        Filter out auction notices: flag every document whose text contains
+        "拍卖" (auction) by setting jump_signal = 1.
+        NOTE(review): iterates and saves ALL documents one by one — slow on
+        a large corpus; the method name also shadows the builtin filter.
+        :return: None
+        '''
+        import re
+        ieDocuments = IEDocument.objects.all()
+        for obj in ieDocuments:
+            if re.search("拍卖",obj.text) is not None:
+                obj.jump_signal = 1
+                obj.save()
+                print(obj.human_identifier)
+
+
+
+# Ad-hoc driver: hard-coded usernames/dates; other operations are kept
+# commented out and toggled by hand when needed.
+if __name__=="__main__":
+    settle = Settlement()
+    # settle.makeMigrate("test","2020-08-01","2020-08-31")
+    settle.makePayroll(["test3","test19","test22","test2","test9","test11","test12","test1","test7","test21","test17"],"2020-08-01","2020-12-25")
+    # settle.makePayrolls("2020-08-01","2020-08-31")
+    settle.exportPayroll(begin_time=None,end_time='2020-12-25')
+    # settle.createUser_batch(batch_size=102)
+    # settle.exportLabels()
+    # settle.filter()

BIN
examples/test/bin/分组_1.xls


BIN
examples/test/bin/分组_10.xls


BIN
examples/test/bin/分组_2.xls


BIN
examples/test/bin/分组_3.xls


BIN
examples/test/bin/分组_4.xls


BIN
examples/test/bin/分组_5.xls


BIN
examples/test/bin/分组_6.xls


BIN
examples/test/bin/分组_7.xls


Certains fichiers n'ont pas été affichés car il y a eu trop de fichiers modifiés dans ce diff