Source code for pycrypt.scorers.languagescorer

import scorer
from .. import utils
from unidecode import unidecode
# import cgetngramfrequencies

[docs]class LanguageScorer(scorer.Scorer): """Scorer for languages based on N-grams and words""" words = None minWordLen = 3 maxWordLen = 10 log = False ngramWeights = None wordWeight = 0 unidec = True
[docs] def setIdealNgramFrequencies(self, freqs): self.idealNgramFrequencies = freqs self.idealNgramsKeySets = [set(i.keys()) for i in freqs] self.ngramLens = [len(i.keys()[0]) for i in freqs] if (self.ngramWeights == None): self.ngramWeights = [1] * len(freqs)
[docs] def loadWordList(self, path, minwordlen = 3, maxwordlen = 10): """Load words from file, 1 word per line""" self.minWordLen = minwordlen self.maxWordLen = maxwordlen self.words = set([line.strip().upper() for line in open(path)])
[docs] def setWeights(self, ngram_weights, word_weight = 0): """Score multipliers, ngram_weights is list corresponding to ideal frequencies when something is 0, it's ignored when scoring""" self.ngramWeights = ngram_weights self.wordWeight = word_weight
[docs] def getNgramFrequencies(self, text, length): """Get dictionary of frequencies of N-grams (of given length)""" # return cgetngramfrequencies.getNgramFrequencies(text, length) d = {} for i in range(len(text) + 1 - length): sub = text[i:i+length] if (d.has_key(sub)): d[sub] += 1 else: d[sub] = 1 # for i in d: # d[i] /= len(text) return d
[docs] def scoreNgrams(self, text): scores = [] for i, ideal_freq in enumerate(self.idealNgramFrequencies): scores.append(0.0) if (self.ngramWeights[i]): # 0 is to ignore text_freq = self.getNgramFrequencies(text, self.ngramLens[i]) for ngram in list(self.idealNgramsKeySets[i] & set(text_freq.keys())): # get only mutual ngrams scores[i] += ideal_freq[ngram] * (text_freq[ngram] / float(len(text))) # weird equation, but it works return scores
[docs] def scoreWords(self, text): if (self.maxWordLen == 0 or self.words == None): return 0 s = text pts = 0.0 for length in range(self.minWordLen, self.maxWordLen): for pos in range(len(s) - 1 - length): if (s[pos:pos+length] in self.words): pts += length pts /= len(s) return (pts ** 2.0) * 0.8
@utils.cache def score(self, text): if (self.unidec): text = unidecode(unicode(text)).upper() else: text = text.upper() ngrams_scores = [i * j for i, j in zip(self.ngramWeights, self.scoreNgrams(text))] word_score = self.scoreWords(text) * self.wordWeight final_score = sum(ngrams_scores) + word_score if (self.log): print([ngrams_scores, word_score], "Total:", final_score) return final_score