Source code for tagit.metrics

"""

Part of the tagit module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2016

"""
# IMPORTS
import Levenshtein as ls
from os.path import join, dirname

# INNER-MODULE IMPORTS
#from external import Spellcheck

# CONSTANTS

# EXPORTS
__all__ = ('tags_similarity', 'stat_per_tag', 'stat_per_image')

## CODE ##

def tags_similarity(tags):
    """Find, for every tag, the other tags that look like variants of it.

    Two tags are reported as similar when at least one of these holds:

    * Similar case: they are equal when lower-cased.
    * Substring: one tag is contained in the other (this covers prefixes).
    * Transposition: swapping two adjacent characters of one tag yields
      the other.
    * Typo: their Levenshtein edit distance is at most 1.

    Synonyms and similarity based on the images attached to a tag are not
    evaluated here.

    Parameters
    ----------
    tags : sequence of str
        The tags to compare with each other. The sequence is walked once
        per tag, so it has to be re-iterable (e.g. a list).

    Returns
    -------
    list of list of str
        One list per entry of *tags*, in the same order, holding the tags
        similar to that entry (in the order they appear in *tags*). Tags
        equal to the entry itself are never reported.
    """
    similar = []
    for needle in tags:
        needle_lower = needle.lower()
        # Every string obtained by swapping two adjacent characters of the
        # needle. A set makes the membership test below a hash lookup.
        transposes = {
            needle[:i] + needle[i + 1] + needle[i] + needle[i + 2:]
            for i in range(len(needle) - 1)
        }
        # All criteria have the same effect (the tag is reported), so the
        # cheap string checks run first and the edit distance is only
        # computed when none of them matched.
        similar.append([
            tag for tag in tags
            if tag != needle and (
                tag.lower() == needle_lower
                or needle in tag
                or tag in needle
                or tag in transposes
                or ls.distance(needle, tag) <= 1
            )
        ])
    return similar
def stat_per_tag(model):
    """Return summary statistics over the values of the tag histogram.

    The statistics are computed over the values of
    ``model.tags.histogram()`` (presumably the number of images carrying
    each tag -- confirm against the model implementation).

    Parameters
    ----------
    model
        Object providing ``num_images()`` and ``tags.histogram()``; the
        latter must return a mapping with a ``values()`` method.

    Returns
    -------
    tuple
        ``(count, min, max, mean, std, variance, 25th percentile, median,
        75th percentile, n_equal_total, n_at_most_one)`` where *variance*
        is the squared standard deviation, *n_equal_total* is the number
        of histogram values equal to ``model.num_images()`` and
        *n_at_most_one* is the number of histogram values ``<= 1``.

    Raises
    ------
    ValueError
        If the histogram is empty, since NumPy cannot take the minimum of
        an empty array.
    """
    import numpy as np

    images_total = model.num_images()
    # On Python 3, ``values()`` is a view object; handing it to np.array()
    # directly produces a 0-d object array on which len() fails. Turning
    # it into a list first gives a proper 1-d array on Python 2 and 3.
    n_tags = np.array(list(model.tags.histogram().values()))
    spread = np.std(n_tags)
    return (
        len(n_tags),
        np.min(n_tags),
        np.max(n_tags),
        np.mean(n_tags),
        spread,
        spread ** 2,
        np.percentile(n_tags, 25),
        np.median(n_tags),
        np.percentile(n_tags, 75),
        (n_tags == images_total).sum(),
        (n_tags <= 1).sum(),
    )
def stat_per_image(model):
    """Return summary statistics over the number of tags per image.

    For every item yielded by ``model.query()`` the length of
    ``model.tags.get(item)`` is taken; the statistics describe these
    lengths.

    Parameters
    ----------
    model
        Object providing ``query()``, ``tags.get(item)`` and
        ``tags.get_all()``.

    Returns
    -------
    tuple
        ``(count, min, max, mean, std, variance, 25th percentile, median,
        75th percentile, n_with_all_tags, n_at_most_five)`` where
        *variance* is the squared standard deviation, *n_with_all_tags*
        counts the items whose tag count equals
        ``len(model.tags.get_all())`` and *n_at_most_five* counts the
        items with five or fewer tags.
    """
    import numpy as np

    tags_total = len(model.tags.get_all())
    per_image = np.array([len(model.tags.get(image)) for image in model.query()])

    deviation = np.std(per_image)
    lower_quartile = np.percentile(per_image, 25)
    upper_quartile = np.percentile(per_image, 75)
    with_all_tags = (per_image == tags_total).sum()
    with_few_tags = (per_image <= 5).sum()

    return (
        len(per_image),
        np.min(per_image),
        np.max(per_image),
        np.mean(per_image),
        deviation,
        deviation ** 2,
        lower_quartile,
        np.median(per_image),
        upper_quartile,
        with_all_tags,
        with_few_tags,
    )

## EOF ##