View Javadoc

1   /*
2    * NGramFactory.java created on Jan 4, 2009.
3    * Copyright 2009 All Eight, LLC
4    * 
5    * This file is part of textkit4j.
6    * 
7    * textkit4j is free software: you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation, either version 3 of the License, or
10   * (at your option) any later version.
11   * 
12   * textkit4j is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   * 
17   * You should have received a copy of the GNU General Public License
18   * along with textkit4j.  If not, see <http://www.gnu.org/licenses/>.
19   */
20  package net.sf.textkit4j.matching;
21  
22  /**
23   * Generates a map of n-grams and their count in the supplied text. The supplied
24   * text can be lower-cased, white-space can be collapsed into a single
25   * white-space, and punctuation can be filtered out, depending on how this is
26   * configured.
27   * 
28   * @author rich
29   */
30  public abstract class NGramFactory
31  {
32      boolean collapseWhiteSpace = true; // sensible default?
33  
34      boolean stripPunctuation = true; // sensible default?
35  
36      boolean lowerCase = true; // sensible default?
37  
38      /**
39       * Generates new n-grams.
40       * 
41       * @param text
42       * @param n
43       * @return
44       */
45      abstract NGrams newNGrams(String text, int n);
46  
47      /**
48       * 
49       * @param text
50       * @return
51       */
52      public NGrams unigrams(String text)
53      {
54          return newNGrams(preProcess(text), 1);
55      }
56  
57      /**
58       * 
59       * @param text
60       * @return
61       */
62      public NGrams bigrams(String text)
63      {
64          return newNGrams(preProcess(text), 2);
65      }
66  
67      /**
68       * 
69       * @param text
70       * @return
71       */
72      public NGrams trigrams(String text)
73      {
74          return newNGrams(preProcess(text), 3);
75      }
76  
77      /*
78       * Strips punctuation, lowers case, etc depending on settings.
79       */
80      private String preProcess(String text)
81      {
82          if (collapseWhiteSpace)
83              text = text.replaceAll("\\s+", " ");
84  
85          if (stripPunctuation)
86              ; // TODO: strip it
87  
88          if (lowerCase)
89              text = text.toLowerCase();
90  
91          return text;
92      }
93  
94      public boolean isLowerCase()
95      {
96          return lowerCase;
97      }
98  
99      public void setLowerCase(boolean lowerCase)
100     {
101         this.lowerCase = lowerCase;
102     }
103 
104     public boolean isCollapseWhiteSpace()
105     {
106         return collapseWhiteSpace;
107     }
108 
109     public void setCollapseWhiteSpace(boolean collapseWhiteSpace)
110     {
111         this.collapseWhiteSpace = collapseWhiteSpace;
112     }
113 
114     public boolean isStripPunctuation()
115     {
116         return stripPunctuation;
117     }
118 
119     public void setStripPunctuation(boolean stripPunctuation)
120     {
121         this.stripPunctuation = stripPunctuation;
122     }
123 
124 }