#!/usr/bin/env python
# -*- Mode: Python; tab-width: 4; indent-tabs-mode: nil; coding: utf-8; -*-
# vim:set ft=python ts=4 sw=4 sts=4 autoindent:

"""Normalization support."""

from datetime import datetime
from functools import reduce

import normdb
import sdistance
import simstringdb
from document import real_directory
from message import Messager
from normdb import string_norm_form
from projectconfig import ProjectConfiguration

# whether to display alignment scores in the search result table
DISPLAY_SEARCH_SCORES = False

# maximum alignment score (tsuruoka_local)
MAX_SCORE = 1000

# maximum allowed alignment score (tsuruoka_local) difference between
# the score for a string s and the best known score before s is
# excluded from consideration
MAX_DIFF_TO_BEST_SCORE = 200

# maximum number of search results to return
MAX_SEARCH_RESULT_NUMBER = 1000

# debugging
NORM_LOOKUP_DEBUG = True
REPORT_LOOKUP_TIMINGS = False


def _check_DB_version(database):
    # TODO: not implemented yet for new-style SQL DBs.
    pass


def _report_timings(dbname, start, msg=None):
    delta = datetime.now() - start
    strdelta = str(delta).replace('0:00:0', '')  # take out zero min & hour
    queries = normdb.get_query_count(dbname)
    normdb.reset_query_count(dbname)
    Messager.info("Processed " + str(queries) + " queries in " + strdelta +
                  (msg if msg is not None else ""))


def _get_db_path(database, collection):
    if collection is None:
        # TODO: default to WORK_DIR config?
        return None
    else:
        try:
            conf_dir = real_directory(collection)
            projectconf = ProjectConfiguration(conf_dir)
            norm_conf = projectconf.get_normalization_config()
            for entry in norm_conf:
                dbname, dbpath = entry[0], entry[3]
                if dbname == database:
                    return dbpath
            # not found in config
            Messager.warning('DB ' + database + ' not defined in config for ' +
                             collection + ', falling back on default.')
            return None
        except Exception:
            # whatever goes wrong, just warn and fall back on the default
            Messager.warning('Failed to get DB path from config for ' +
                             collection + ', falling back on default.')
            return None
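
# Note: each norm_conf entry is assumed above to be a sequence whose first
# element is the database name and whose fourth element is the database
# path; a hypothetical entry might look like
#     ('UniProt', <home URL>, <base URL>, '/path/to/UniProt.db')
# (the URL placeholders stand for whatever the other config fields hold).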


def norm_get_name(database, key, collection=None):
    if NORM_LOOKUP_DEBUG:
        _check_DB_version(database)
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    dbpath = _get_db_path(database, collection)
    if dbpath is None:
        # full path not configured, fall back on the DB name as default
        dbpath = database

    try:
        data = normdb.data_by_id(dbpath, key)
    except normdb.dbNotFoundError as e:
        Messager.warning(str(e))
        data = None

    # data is a list of groups of (label, value) pairs; take the value
    # of the first pair in the first group as the name.
    if data is not None:
        value = data[0][0][1]
    else:
        value = None

    if REPORT_LOOKUP_TIMINGS:
        _report_timings(database, lookup_start)

    # echo request for sync
    json_dic = {
        'database': database,
        'key': key,
        'value': value
    }
    return json_dic
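
# Response sketch (hypothetical stored value): a lookup such as
#     norm_get_name('UniProt', 'Q64131')
# returns a dict echoing the request, e.g.
#     {'database': 'UniProt', 'key': 'Q64131', 'value': 'Runx3'}
# where the value is whatever the first label:value pair stored for the
# key holds.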


def norm_get_data(database, key, collection=None):
    if NORM_LOOKUP_DEBUG:
        _check_DB_version(database)
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    dbpath = _get_db_path(database, collection)
    if dbpath is None:
        # full path not configured, fall back on the DB name as default
        dbpath = database

    try:
        data = normdb.data_by_id(dbpath, key)
    except normdb.dbNotFoundError as e:
        Messager.warning(str(e))
        data = None

    if data is None:
        Messager.warning("Failed to get data for " + database + ":" + key)

    if REPORT_LOOKUP_TIMINGS:
        _report_timings(database, lookup_start)

    # echo request for sync
    json_dic = {
        'database': database,
        'key': key,
        'value': data
    }
    return json_dic


# TODO: deprecated, confirm unnecessary and remove.
# def norm_get_ids(database, name, collection=None):
#     if NORM_LOOKUP_DEBUG:
#         _check_DB_version(database)
#     if REPORT_LOOKUP_TIMINGS:
#         lookup_start = datetime.now()
#
#     dbpath = _get_db_path(database, collection)
#     if dbpath is None:
#         # full path not configured, fall back on name as default
#         dbpath = database
#
#     keys = normdb.ids_by_name(dbpath, name)
#
#     if REPORT_LOOKUP_TIMINGS:
#         _report_timings(database, lookup_start)
#
#     # echo request for sync
#     json_dic = {
#         'database': database,
#         'value': name,
#         'keys': keys,
#     }
#     return json_dic


def _format_datas(datas, scores=None, matched=None):
    # helper for norm_search(); formats data from the DB into a table
    # for the client, sorted by score if scores are given.
    if scores is None:
        scores = {}
    if matched is None:
        matched = {}

    # chop off all but the first two groups of label:value pairs for
    # each key; later ones are assumed to be additional information
    # not intended for display in search results.
    # TODO: avoid the unnecessary queries for this information.
    cropped = {}
    for key in datas:
        cropped[key] = datas[key][:2]
    datas = cropped

    # organize into a table format with separate header and data
    # (this matches the collection browser data format)
    unique_labels = []
    seen_label = {}
    for key in datas:
        # check for duplicates within each entry
        seen_label_for_key = {}
        for i, group in enumerate(datas[key]):
            for label, value in group:
                if label not in seen_label:
                    # store with group index so that all labels can be
                    # sorted by group index first
                    unique_labels.append((i, label))
                    seen_label[label] = True
                if label in seen_label_for_key:
                    # too noisy, and not really harmful now that matching
                    # values are preferred for repeated labels.
                    # Messager.warning("Repeated label (%s) in normalization data not supported" % label)
                    pass
                seen_label_for_key[label] = True

    # sort unique labels by group index (the sort is otherwise stable,
    # which holds since Python 2.3), and flatten
    unique_labels.sort(key=lambda a: a[0])
    unique_labels = [a[1] for a in unique_labels]

    # ID is the first field, and the datatype is "string" for all labels
    header = [(label, "string") for label in ["ID"] + unique_labels]
    if DISPLAY_SEARCH_SCORES:
        header += [("score", "int")]

    # construct items, sorted by score first, ID second (the latter for
    # stability)
    sorted_keys = sorted(list(datas.keys()),
                         key=lambda a: (scores.get(a, 0), a),
                         reverse=True)

    items = []
    for key in sorted_keys:
        # make a dict for lookup. In case of duplicates (e.g. multiple
        # "synonym" entries), prefer values that were matched.
        # TODO: prefer more exact matches when multiple are found.
        data_dict = {}
        for group in datas[key]:
            for label, value in group:
                if label not in data_dict or (value in matched and
                                              data_dict[label] not in matched):
                    data_dict[label] = value

        # construct item
        item = [str(key)]
        for label in unique_labels:
            if label in data_dict:
                item.append(data_dict[label])
            else:
                item.append('')

        if DISPLAY_SEARCH_SCORES:
            item += [str(scores.get(key))]

        items.append(item)

    return header, items
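
# Shape sketch (hypothetical labels and values): with DISPLAY_SEARCH_SCORES
# off, _format_datas might return
#     header = [('ID', 'string'), ('name', 'string'), ('synonym', 'string')]
#     items  = [['P01579', 'Interferon gamma', 'IFN-gamma']]
# i.e. one row per DB key, with columns ordered by the group index at which
# each label first appeared.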


def _norm_filter_score(score, best_score=MAX_SCORE):
    return score < best_score - MAX_DIFF_TO_BEST_SCORE
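
# Worked example (using the constants above): with best_score == 950 the
# cutoff is 950 - 200 == 750, so _norm_filter_score(740, 950) is True (the
# string is excluded) while _norm_filter_score(760, 950) is False (kept).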


# TODO: get rid of the arbitrary max_cost default constant
def _norm_score(substring, name, max_cost=500):
    # returns an integer score representing the similarity of the given
    # substring to the given name (larger is better).
    cache = _norm_score.__cache
    if (substring, name) not in cache:
        cost = sdistance.tsuruoka_local(substring, name, max_cost=max_cost)
        # debugging
        # Messager.info('%s --- %s: %d (max %d)' % (substring, name, cost, max_cost))
        score = MAX_SCORE - cost
        cache[(substring, name)] = score
    # TODO: should we avoid exceeding max_cost? Cached values might.
    return cache[(substring, name)]


_norm_score.__cache = {}
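
# Scoring sketch: tsuruoka_local returns an alignment cost that is
# subtracted from MAX_SCORE, so lower costs give higher scores; identical
# strings (assumed to have cost 0) would score 1000, and a hypothetical
# cost of 30 would score 970.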


def _norm_search_name_attr(database, name, attr,
                           matched, score_by_id, score_by_str,
                           best_score=0, exactmatch=False,
                           threshold=simstringdb.DEFAULT_THRESHOLD):
    # helper for norm_search; searches the given DB for matches where the
    # given name appears either in full or as an approximate substring of
    # a full name (if exactmatch is False). If attr is not None, requires
    # its value to appear as an attribute of the entry with the matched
    # name. Updates matched, score_by_id, and score_by_str; returns
    # best_score.

    # If there are no strict substring matches for a given attribute
    # in the simstring DB, we can be sure that no query can succeed,
    # and can fail early.
    # TODO: this would be more effective (as would some other things)
    # if the attributes were in a separate simstring DB from the
    # names.
    if attr is not None:
        normattr = string_norm_form(attr)
        if not simstringdb.ssdb_supstring_exists(normattr, database, 1.0):
            # debugging
            # Messager.info('Early norm search fail on "%s"' % attr)
            return best_score

    if exactmatch:
        # the only candidate string is the given name
        strs = [name]
        ss_norm_score = {string_norm_form(name): 1.0}
    else:
        # expand to substrings using simstring
        # simstring requires UTF-8
        normname = string_norm_form(name)
        str_scores = simstringdb.ssdb_supstring_lookup(normname, database,
                                                       threshold, True)
        strs = [s[0] for s in str_scores]
        ss_norm_score = dict(str_scores)

        # TODO: recreate this older filter; watch out for which name to use!
        # # filter to strings not already considered
        # strs = [s for s in strs if (normname, s) not in score_by_str]

    # look up IDs
    if attr is None:
        id_names = normdb.ids_by_names(database, strs, False, True)
    else:
        id_names = normdb.ids_by_names_attr(database, strs, attr, False, True)

    # sort by simstring (n-gram overlap) score to prioritize likely
    # good hits.
    # TODO: this doesn't seem to be having a very significant effect.
    # consider removing as an unnecessary complication (ss_norm_score also).
    id_name_scores = [(i, n, ss_norm_score[string_norm_form(n)])
                      for i, n in id_names]
    id_name_scores.sort(key=lambda a: a[2], reverse=True)
    id_names = [(i, n) for i, n, s in id_name_scores]

    # update matches and scores
    for i, n in id_names:
        if n not in matched:
            matched[n] = set()
        matched[n].add(i)

        max_cost = MAX_SCORE - best_score + MAX_DIFF_TO_BEST_SCORE + 1
        if (name, n) not in score_by_str:
            # TODO: decide whether to use normalized or unnormalized strings
            # for scoring here.
            # score_by_str[(name, n)] = _norm_score(name, n, max_cost)
            score_by_str[(name, n)] = _norm_score(
                string_norm_form(name), string_norm_form(n), max_cost)
        score = score_by_str[(name, n)]
        best_score = max(score, best_score)

        score_by_id[i] = max(score_by_id.get(i, -1),
                             score_by_str[(name, n)])

        # stop if the maximum result count has been reached
        if len(score_by_id) > MAX_SEARCH_RESULT_NUMBER:
            Messager.info(
                'Note: more than %d search results, only retrieving top matches' %
                MAX_SEARCH_RESULT_NUMBER)
            break

    return best_score
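
# Data-structure sketch (hypothetical scores): after a query for "Runx3",
# the accumulators might hold
#     matched      ~ {'Runx3': {'Q64131'}}
#     score_by_str ~ {('Runx3', 'Runx3'): 1000}
#     score_by_id  ~ {'Q64131': 1000}
# i.e. matched maps each matched DB name to the set of IDs carrying it,
# score_by_str scores (query, name) pairs, and score_by_id keeps the best
# score seen for each ID.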


def _norm_search_impl(database, name, collection=None, exactmatch=False):
    if NORM_LOOKUP_DEBUG:
        _check_DB_version(database)
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    dbpath = _get_db_path(database, collection)
    if dbpath is None:
        # full path not configured, fall back on the DB name as default
        dbpath = database

    # maintain a map from searched names to matching IDs and scores for
    # ranking
    matched = {}
    score_by_id = {}
    score_by_str = {}

    # look up hits where the name appears in full
    best_score = _norm_search_name_attr(dbpath, name, None,
                                        matched, score_by_id, score_by_str,
                                        0, exactmatch)

    # if there are no hits and we only have a simple candidate string,
    # retry the lookup with a low threshold
    if best_score == 0 and len(name.split()) == 1:
        best_score = _norm_search_name_attr(dbpath, name, None,
                                            matched, score_by_id, score_by_str,
                                            0, exactmatch, 0.5)

    # if there are no good hits, also consider only part of the input
    # as the name and the rest as an attribute.
    # TODO: reconsider arbitrary cutoff
    if best_score < 900 and not exactmatch:
        parts = name.split()
        # prioritize having the attribute after the name
        for i in range(len(parts) - 1, 0, -1):
            # TODO: this early termination is sub-optimal: it's not
            # possible to know in advance which way of splitting the
            # query into parts yields the best results. Reconsider.
            if len(score_by_id) > MAX_SEARCH_RESULT_NUMBER:
                break

            start = ' '.join(parts[:i])
            end = ' '.join(parts[i:])

            # query both ways (start as name and end as attr, and vice versa)
            best_score = _norm_search_name_attr(dbpath, start, end,
                                                matched, score_by_id,
                                                score_by_str,
                                                best_score, exactmatch)
            best_score = _norm_search_name_attr(dbpath, end, start,
                                                matched, score_by_id,
                                                score_by_str,
                                                best_score, exactmatch)

    # flatten the matches to a single set of IDs
    ids = reduce(set.union, list(matched.values()), set())

    # filter out IDs that now (after all queries have completed) fail
    # the score cutoff
    # TODO: are we sure that this is a good idea?
    ids = set([i for i in ids
               if not _norm_filter_score(score_by_id[i], best_score)])

    # TODO: avoid unnecessary queries: datas_by_ids queries for names,
    # attributes and infos, but _format_datas only uses the first two.
    datas = normdb.datas_by_ids(dbpath, ids)

    header, items = _format_datas(datas, score_by_id, matched)

    if REPORT_LOOKUP_TIMINGS:
        _report_timings(database, lookup_start,
                        ", retrieved " + str(len(items)) + " items")

    # echo request for sync
    json_dic = {
        'database': database,
        'query': name,
        'header': header,
        'items': items,
    }
    return json_dic


def norm_search(database, name, collection=None, exactmatch=False):
    try:
        return _norm_search_impl(database, name, collection, exactmatch)
    except simstringdb.ssdbNotFoundError as e:
        Messager.warning(str(e))
        return {
            'database': database,
            'query': name,
            'header': [],
            'items': []
        }
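
# Usage sketch (assumes a normalization DB named 'UniProt' is configured):
#     response = norm_search('UniProt', 'interferon gamma human')
#     for item in response['items']:
#         print(item[0])  # matched DB IDs, best-scoring first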


def _test():
    # exercise norm_search() with sample queries and expected UniProt IDs
    test_cases = {
        'UniProt': {
            'Runx3': 'Q64131',
            'Runx3 mouse': 'Q64131',
            'Runx1': 'Q03347',
            'Runx1 mouse': 'Q03347',
            'Eomes': 'O54839',
            'Eomes mouse': 'O54839',
            'granzyme B': 'P04187',
            'granzyme B mouse': 'P04187',
            'INF-gamma': 'P01580',
            'INF-gamma mouse': 'P01580',
            'IL-2': 'P04351',
            'IL-2 mouse': 'P04351',
            'T-bet': 'Q9JKD8',
            'T-bet mouse': 'Q9JKD8',
            'GATA-1': 'P15976',
            'GATA-1 human': 'P15976',
            'Interleukin-10': 'P22301',
            'Interleukin-10 human': 'P22301',
            'Interleukin-12': 'P29459',
            'Interleukin-12 human': 'P29459',
            'interferon-gamma': 'P01579',
            'interferon-gamma human': 'P01579',
            'interferon gamma human': 'P01579',
            'Fas ligand': 'P48023',
            'Fas ligand human': 'P48023',
            'IkappaB-alpha': 'P25963',
            'IkappaB-alpha human': 'P25963',
            'transforming growth factor (TGF)-beta1': 'P01137',
            'transforming growth factor (TGF)-beta1 human': 'P01137',
            'transforming growth factor beta1 human': 'P01137',
            'tumor necrosis factor alpha': 'P01375',
            'tumor necrosis factor alpha human': 'P01375',
            'Epstein-Barr virus latent membrane protein LMP1': 'Q1HVB3',
            'TATA box binding protein': 'P20226',
            'TATA box binding protein human': 'P20226',
            'HIV protease': '??????',  # TODO
            # TODO
            'human immunodeficiency virus type 1 (HIV) protease': '??????',
        }
    }

    overall_start = datetime.now()
    query_count, hit_count = 0, 0
    misses = []
    for DB in test_cases:
        for query in test_cases[DB]:
            target = test_cases[DB][query]
            start = datetime.now()
            results = norm_search(DB, query)
            delta = datetime.now() - start

            found = False
            found_rank = -1
            for rank, item in enumerate(results['items']):
                id_ = item[0]
                if id_ == target:
                    found = True
                    found_rank = rank + 1
                    break

            strdelta = str(delta).replace('0:00:0', '').replace('0:00:', '')
            print("%s: '%s' <- '%s' rank %d/%d (%s sec)" %
                  (' ok' if found else 'MISS',
                   target, query,
                   found_rank,
                   len(results['items']),
                   strdelta))

            query_count += 1
            if found:
                hit_count += 1
            else:
                misses.append((query, target))

    if len(misses) != 0:
        print()
        print("MISSED:")
        for query, target in misses:
            print("%s '%s'" % (target, query))

    delta = datetime.now() - overall_start
    strdelta = str(delta).replace('0:00:0', '').replace('0:00:', '')
    print()
    print("Found %d / %d in %s" % (hit_count, query_count, strdelta))


def _profile_test():
    # runs _test() with profiling, storing the results in "norm.profile".
    # To view the profile, run e.g.
    #     python -c 'import pstats; pstats.Stats("norm.profile").strip_dirs().sort_stats("time").print_stats()' | less
    import cProfile
    cProfile.run('_test()', 'norm.profile')


if __name__ == '__main__':
    _test()  # normal
    # _profile_test()  # profiled