"""
Part of the tagit module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2016
"""
# IMPORTS
import Levenshtein as ls
from os.path import join, dirname
# INNER-MODULE IMPORTS
#from external import Spellcheck
# CONSTANTS
# EXPORTS
__all__ = ('tags_similarity', 'stat_per_tag', 'stat_per_image')
## CODE ##
def tags_similarity(tags):
"""
* Similar case
* Similar word (typo: common errors, change of two letters)
* Synonym
* Prefix (one word is subset of the other)
* Similar results
* One is a subset of the other (w.r.t images)
"""
similar = []
#lang = 'de'
#wordlist = 'words_{0}.t'.format(lang)
#known_words = file(join(dirname(__file__), 'resources', 'dict', wordlist)).read().split()
#spell = Spellcheck(lang)
for needle in tags:
sim = []
        # Variants of the needle with two adjacent letters swapped
        splits = [(needle[:i], needle[i:]) for i in range(len(needle) + 1)]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
for tag in tags:
if tag == needle: continue # Ignore needle
# Similar case
if tag.lower() == needle.lower():
sim.append(tag)
continue
            # Substring: one tag is contained in the other
            if needle in tag or tag in needle:
sim.append(tag)
continue
# Similar word: Edit distance
if ls.distance(needle, tag) <= 1:
sim.append(tag)
continue
# Similar word: Change of two letters
# see 3rdparty/spellcheck.py
if tag in transposes:
sim.append(tag)
continue
# Similar word: Spellcheck
#if needle not in known_words:
# sim.append(spell.correct(needle))
similar.append(sim)
return similar
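
# Usage sketch (illustration only; the tag names below are invented for the
# example). With this input, tags_similarity groups case variants, substrings,
# and near-duplicates:
#
#   groups = tags_similarity(['Cat', 'cat', 'cats', 'dog'])
#   # groups == [['cat'], ['Cat', 'cats'], ['cat'], []]
#
# 'Cat' and 'cat' match by case, 'cat' and 'cats' by substring, and 'dog' has
# no similar tag, so its entry is empty.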
def stat_per_tag(model):
    """Per-tag statistics over how many images each tag is assigned to."""
    import numpy as np
    images_total = model.num_images()
    n_tags = np.array(list(model.tags.histogram().values()))
    return (
        len(n_tags)                       # number of tags
        , np.min(n_tags)                  # fewest images per tag
        , np.max(n_tags)                  # most images per tag
        , np.mean(n_tags)                 # mean images per tag
        , np.std(n_tags)                  # standard deviation
        , np.std(n_tags) ** 2             # variance
        , np.percentile(n_tags, 25)       # lower quartile
        , np.median(n_tags)               # median
        , np.percentile(n_tags, 75)       # upper quartile
        , (n_tags == images_total).sum()  # tags assigned to every image
        , (n_tags <= 1).sum()             # tags assigned to at most one image
        )
def stat_per_image(model):
    """Per-image statistics over how many tags each image carries."""
    import numpy as np
    tags_total = len(model.tags.get_all())
n_tags = np.array([len(model.tags.get(im)) for im in model.query()])
    return (
        len(n_tags)                     # number of images
        , np.min(n_tags)                # fewest tags per image
        , np.max(n_tags)                # most tags per image
        , np.mean(n_tags)               # mean tags per image
        , np.std(n_tags)                # standard deviation
        , np.std(n_tags) ** 2           # variance
        , np.percentile(n_tags, 25)     # lower quartile
        , np.median(n_tags)             # median
        , np.percentile(n_tags, 75)     # upper quartile
        , (n_tags == tags_total).sum()  # images carrying every tag
        , (n_tags <= 5).sum()           # images with at most five tags
        )
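
# Sketch of how the statistics tuples might be consumed (illustration only;
# `model` is assumed to provide the tagit model API used above, and the
# helper below is hypothetical, not part of the module). Both functions
# return the same 11-field layout, so one printer covers both.
def _print_stats(label, stats):
    fields = ('count', 'min', 'max', 'mean', 'std', 'var',
              'q25', 'median', 'q75', 'n_full', 'n_sparse')
    print(label)
    for name, value in zip(fields, stats):
        print('  {0}: {1}'.format(name, value))

#_print_stats('Tags', stat_per_tag(model))
#_print_stats('Images', stat_per_image(model))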
## EOF ##