"""
Part of the tagit module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2016
"""
# IMPORTS
import operator
from math import log
# INNER-MODULE IMPORTS
from ...basics import difference, fst, snd, split
from tags import CSidebox_Tags
from tagit.external.memoize import memoized
from ...token import Token_Tag
# EXPORTS
__all__ = ('CSidebox_Tags_Suggested', )
## CODE ##
def log2(x):
    """Return the base-2 logarithm of *x*, computed as log(x) / log(2)."""
    return log(x) / log(2.0)
class Score(object):
    """Weighted scoring of a tag with respect to the current result set.

    The instance keeps a list ``scores`` of ``(weight, function)`` pairs;
    :meth:`get_score` evaluates every function for a tag and returns the
    weighted sum. All count helpers return floats (they multiply by 1.0),
    so the ratios below never use integer division.

    Most helpers are wrapped in ``memoized``
    (NOTE(review): presumably caches per argument tuple -- confirm against
    tagit.external.memoize).

    Parameters
    ----------
    model
        Object providing ``num_images()``, ``query(tokens)`` and a ``tags``
        attribute with ``get_all()`` and ``histogram(include=...)``.
    results
        Sized collection of the current results.
    filtered
        Stored as ``_filtered``; not used by the code in this class.
    specials
        Stored as ``_specials`` (defaults to a new empty list); not used by
        the code in this class.
    """

    def __init__(self, model, results, filtered, specials=None):
        if specials is None:
            specials = []
        self._specials = specials
        self._results = results
        self._model = model
        self._filtered = filtered
        self._constant = 1.0

        # Active score functions as (weight, function) pairs. Uncomment an
        # entry to enable it; the trailing commas keep the list well-formed
        # when more than one entry is active.
        self.scores = [
            #(1.0, self.split_ratio),
            (1.0, self.evenness),
            #(1.0, self.entropy),
            #(1.0, self.self_information),
            #(1.0, self.mutual_information),
            #(1.0, self.specificality),
        ]

        # Without results or images every ratio would divide by zero, so
        # fall back to a constant score.
        if len(self._results) == 0 or self.images() == 0:
            self.scores = [(1.0, self.constant)]

    def get_score(self, tag):
        """Return the weighted sum of all active score functions for *tag*."""
        return sum(weight * score(tag) for weight, score in self.scores)

    ###########################################################################
    #                            SCORING METHODS                              #
    ###########################################################################

    @memoized
    def images(self):
        """Total number of images in the model, as float."""
        return 1.0 * self._model.num_images()

    @memoized
    def tags(self):
        """Total number of tags in the model, as float."""
        return 1.0 * len(self._model.tags.get_all())

    @memoized
    def associations(self):
        """Total number of associations. Not implemented."""
        raise NotImplementedError()

    @memoized
    def results(self):
        """Number of current results, as float."""
        return 1.0 * len(self._results)

    @memoized
    def tags_hist(self):
        """Tag histogram of the model, restricted to the current results."""
        return self._model.tags.histogram(include=self._results)

    @memoized
    def results_with_tag(self, tag):
        """Number of current results carrying *tag*, as float.

        The value is looked up by subscript in :meth:`tags_hist`; a failed
        lookup propagates to the caller.
        """
        hist = self.tags_hist()
        return 1.0 * hist[tag]

    @memoized
    def results_without_tag(self, tag):
        """Number of current results not carrying *tag*."""
        return self.results() - self.results_with_tag(tag)

    @memoized
    def images_with_tag(self, tag):
        """Number of images returned by a model query for *tag*, as float."""
        return 1.0 * len(self._model.query([Token_Tag(tag)]))

    @memoized
    def images_without_tag(self, tag):
        """Number of images in the model not carrying *tag*."""
        return self.images() - self.images_with_tag(tag)

    @memoized
    def tags_with_image(self, image):
        """Number of tags of *image*. Not implemented."""
        raise NotImplementedError()

    @memoized
    def tags_without_image(self, image):
        """Number of tags not associated with *image*.

        Currently always raises NotImplementedError because it relies on
        :meth:`tags_with_image`.
        """
        return self.tags() - self.tags_with_image(image)

    def constant(self, tag):
        """Return the constant score (1.0), independent of *tag*."""
        return self._constant

    @memoized
    def split_ratio(self, tag):
        """# current results with tag / # current results = p(x | y).

        Raises ZeroDivisionError if there are no current results.
        """
        return self.results_with_tag(tag) / self.results()

    @memoized
    def evenness(self, tag):
        """1.0 - 2.0 * abs(0.5 - split ratio).

        Yields 1.0 if *tag* splits the results in half and 0.0 if it is
        carried by all or by none of the results.
        """
        return 1.0 - 2.0 * abs(0.5 - self.split_ratio(tag))

    @memoized
    def entropy(self, tag):
        """H(X | Y=y) = - sum_x p(x|y) * log2 p(x|y), x in {tag, not tag}.

        Terms with zero probability contribute 0. Raises ZeroDivisionError
        if there are no current results.
        """
        total = self.results()
        acc = 0.0
        for count in (self.results_with_tag(tag),
                      self.results_without_tag(tag)):
            prob = count / total
            if prob > 0:
                acc += prob * log2(prob)
        return -acc

    @memoized
    def self_information(self, tag):
        """Id(x) = -log2 p(x) = -log2(# images with tag / # images).

        A tag that occurs in no image has probability zero; its
        self-information is returned as +infinity instead of raising a math
        domain error.
        """
        with_tag = self.images_with_tag(tag)
        if with_tag <= 0:
            return float('inf')
        return -1.0 * log2(with_tag / self.images())

    @memoized
    def mutual_information(self, tag):
        """I(X; Y=y) = sum_x p(x, y) * log2( p(x, y) / (p(x) * p(y)) ).

        The sum goes over x in {tag, not tag}, with
            p(x, y) = # current results with/without tag / # images
            p(x)    = # images with/without tag / # images
            p(y)    = # current results / # images
        Terms with p(x, y) == 0 contribute 0.
        """
        images = self.images()
        results = self.results()

        def term(joint_count, marginal_count):
            # 0 * log(0) is taken as 0; this also avoids log2(0) and the
            # division by a zero marginal.
            if joint_count <= 0.0:
                return 0.0
            # p(x,y) / (p(x) * p(y)) expressed in counts:
            #   (j / I) / ((m / I) * (R / I)) = j * I / (m * R)
            ratio = joint_count * images / (marginal_count * results)
            return (joint_count / images) * log2(ratio)

        with_tag = term(self.results_with_tag(tag),
                        self.images_with_tag(tag))
        without_tag = term(self.results_without_tag(tag),
                           self.images_without_tag(tag))
        return with_tag + without_tag

    def specificality(self, tag):
        """# images with tag / # current results with tag.

        Raises ZeroDivisionError if no current result carries *tag*.
        """
        return self.images_with_tag(tag) / self.results_with_tag(tag)
"""
restrictiveness/specificality:
# images within current results / # images with tag in total
tag frequency:
# images with tag assigned / # total assignments
# images with tag assigned / # images
# images with tag assigned / # tags
# results
# images with tag
# images without tag
# images
# tags
# total assignments
# current results w/ tag
# current results w/o tag
the results is the intersection of sets
event x
occurrence of tag x in an image
event !x
1 - event x (absence of tag x in an image)
probability
p(x) = # occurrences of (not) tag x / # total assignments
= # images w/ (w/o) tag / # total assignments
conditional probability
p(x|y) = p(x, y) / p(y) (Bayes)
p(x, y) = # occurrences of tag x and y / # total images
= # occurrences of x in current results / # total images
p(y) = # occurrences of y / # total images
= # current results / # total images
p(x|y) = # occurrences of tag x and y / # occurrences of y
p(x|y) = (# occurrences of x in current results / # total images ) / (# current results / # total images)
= # occurrences of x in current results / # current results
p(x|y) = # current results w/ tag / # results
self-information
Id(x) = - log p(x)
Id(x) = - log (# images w/(w/o) tag x / # images)
conditional self-information
Id(x|y) = - log p(x|y)
entropy
H(X | Y=y) = - sum_x p(x|y) * log p(x|y)
sum goes over has(x) and !has(x)
pseudo mutual information
I(X; Y=y) = sum_x p(x, y) * log( p(x, y) / (p(x) * p(y)) )
sum goes over has(x) and !has(x)
split ratio
# current results with tag / # current results
p(x|y)
evenness
1.0 - 2.0 * abs(0.5 - split ratio)
example:
tags: A, B, C, D
images: 1, 2, 3
assignments: (A, 1), (A, 2), (A, 3), (B, 1), (B, 2), (C, 1)
Probability of an image having tag x:
p(x) = # images with tag x / # images
x not x sum
A 3 / 3 = 1 0 / 3 = 0 1
B 2 / 3 1 / 3 1
C 1 / 3 2 / 3 1
D 0 / 3 = 0 3 / 3 = 1 1
sum 6 / 3 = 2 6 / 3 = 2 4
A, B 2 / 3
A, C 1 / 3
A, D 0 / 3
B, C 1 / 3
B, D 0 / 3
C, D 0 / 3
Probability of an assignment being tag x:
p(x) = # assignments of tag x / # total assignments
x not x sum
A 3 / 6 = 1 / 2 3 / 6 = 1 / 2 1
B 2 / 6 = 1 / 3 4 / 6 = 2 / 3 1
C 1 / 6 5 / 6 1
D 0 / 6 6 / 6 = 1 1
sum 6 / 6 = 1 18 / 6 = 3 4
Probability of a tag including image x:
Functions
---------
* images Total number of images
* tags Total number of tags
* associations Total number of associations
* results Total number of current results
* results_with_tag Number of current results with tag x
* images_with_tag Number of images with tag x
* tags_with_image Number of tags of image x
Derived functions:
* results_without_tag Number of current results without tag x
results - results_with_tag
* images_without_tag Number of images without tag x
images - images_with_tag
* tags_without_image Number of tags with no association to image x
tags - tags_with_image
Score functions:
* split_ratio
* evenness
* entropy
* self_information
* cond_self_information
* mutual_information
"""
## EOF ##