1 /*
2 * NGrams.java created on Jan 4, 2009.
3 * Copyright 2009 All Eight, LLC
4 *
5 * This file is part of textkit4j.
6 *
7 * textkit4j is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * textkit4j is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with textkit4j. If not, see <http://www.gnu.org/licenses/>.
19 */
20 package net.sf.textkit4j.matching;
21
22 import java.io.Serializable;
23 import java.util.HashMap;
24 import java.util.Map;
25 import java.util.Set;
26
27 /**
28 * (Plagarized from Wikipedia) An n-gram is a sub-sequence of n items from a
29 * given sequence. n-grams are used in various areas of statistical natural
30 * language processing and genetic sequence analysis. The NGrams class is a
31 * map/table of counts keyed by n-gram sequences of characters or words. The
32 * idea is this can be a handy model for classification/clustering/grouping
33 * applications.
34 *
35 * @author rich
36 */
37 @SuppressWarnings("serial")
38 public class NGrams implements Serializable
39 {
40
41 private Map<CharSequence, Integer> grams = new HashMap<CharSequence, Integer>();
42
43 private Double length = 0.0d;
44
45 NGrams(Map<CharSequence, Integer> grams)
46 {
47 this.grams = grams;
48 this.length = calculateVectorLength();
49 }
50
51 /*
52 * @return The length of this ngram Map as a Vector length.
53 */
54 Double calculateVectorLength()
55 {
56 Double total = 0.0d;
57 for (CharSequence key : this.grams.keySet())
58 total += Math.pow(this.grams.get(key), 2);
59 return Math.sqrt(total);
60 }
61
62 /*
63 * Get's all the grams (wihout counts) from this.
64 */
65 Set<CharSequence> getNGrams()
66 {
67 return this.grams.keySet();
68 }
69
70 /**
71 * Get the count value for a particular gram. If the gram isn't present then
72 * 0 is returned.
73 */
74 Integer get(CharSequence gram)
75 {
76 return this.grams.get(gram) != null ? this.grams.get(gram) : 0;
77 }
78
79 /**
80 * Get the vector length of this NGrams.
81 */
82 Double length()
83 {
84 return this.length;
85 }
86
87 /**
88 * Calculates the similarity of this NGrams with the other NGrams as the
89 * cosine of the angle between the two NGrams vector representation.
90 */
91 public Double similiarity(NGrams other)
92 {
93 Double total = 0.0d;
94 Set<CharSequence> otherTrigramsKeySet = other.getNGrams();
95
96 for (CharSequence key : this.grams.keySet())
97 if (otherTrigramsKeySet.contains(key))
98 total += this.grams.get(key) * other.get(key);
99
100 return total / (this.length() * other.length());
101 }
102
103 }