View Javadoc

1   /*
2    * NGrams.java created on Jan 4, 2009.
3    * Copyright 2009 All Eight, LLC
4    * 
5    * This file is part of textkit4j.
6    * 
7    * textkit4j is free software: you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation, either version 3 of the License, or
10   * (at your option) any later version.
11   * 
12   * textkit4j is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   * 
17   * You should have received a copy of the GNU General Public License
18   * along with textkit4j.  If not, see <http://www.gnu.org/licenses/>.
19   */
20  package net.sf.textkit4j.matching;
21  
22  import java.io.Serializable;
23  import java.util.HashMap;
24  import java.util.Map;
25  import java.util.Set;
26  
27  /**
28   * (Plagarized from Wikipedia) An n-gram is a sub-sequence of n items from a
29   * given sequence. n-grams are used in various areas of statistical natural
30   * language processing and genetic sequence analysis. The NGrams class is a
31   * map/table of counts keyed by n-gram sequences of characters or words. The
32   * idea is this can be a handy model for classification/clustering/grouping
33   * applications.
34   * 
35   * @author rich
36   */
37  @SuppressWarnings("serial")
38  public class NGrams implements Serializable
39  {
40  
41      private Map<CharSequence, Integer> grams = new HashMap<CharSequence, Integer>();
42  
43      private Double length = 0.0d;
44  
45      NGrams(Map<CharSequence, Integer> grams)
46      {
47          this.grams = grams;
48          this.length = calculateVectorLength();
49      }
50  
51      /*
52       * @return The length of this ngram Map as a Vector length.
53       */
54      Double calculateVectorLength()
55      {
56          Double total = 0.0d;
57          for (CharSequence key : this.grams.keySet())
58              total += Math.pow(this.grams.get(key), 2);
59          return Math.sqrt(total);
60      }
61  
62      /*
63       * Get's all the grams (wihout counts) from this.
64       */
65      Set<CharSequence> getNGrams()
66      {
67          return this.grams.keySet();
68      }
69  
70      /**
71       * Get the count value for a particular gram. If the gram isn't present then
72       * 0 is returned.
73       */
74      Integer get(CharSequence gram)
75      {
76          return this.grams.get(gram) != null ? this.grams.get(gram) : 0;
77      }
78  
79      /**
80       * Get the vector length of this NGrams.
81       */
82      Double length()
83      {
84          return this.length;
85      }
86  
87      /**
88       * Calculates the similarity of this NGrams with the other NGrams as the
89       * cosine of the angle between the two NGrams vector representation.
90       */
91      public Double similiarity(NGrams other)
92      {
93          Double total = 0.0d;
94          Set<CharSequence> otherTrigramsKeySet = other.getNGrams();
95  
96          for (CharSequence key : this.grams.keySet())
97              if (otherTrigramsKeySet.contains(key))
98                  total += this.grams.get(key) * other.get(key);
99  
100         return total / (this.length() * other.length());
101     }
102 
103 }