1 /*
2 * NGramFactory.java created on Jan 4, 2009.
3 * Copyright 2009 All Eight, LLC
4 *
5 * This file is part of textkit4j.
6 *
7 * textkit4j is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * textkit4j is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with textkit4j. If not, see <http://www.gnu.org/licenses/>.
19 */
20 package net.sf.textkit4j.matching;
21
22 /**
23 * Generates a map of n-grams and their count in the supplied text. The supplied
24 * text can be lower-cased, white-space can be collapsed into a single
25 * white-space, and punctuation can be filtered out, depending on how this is
26 * configured.
27 *
28 * @author rich
29 */
30 public abstract class NGramFactory
31 {
32 boolean collapseWhiteSpace = true; // sensible default?
33
34 boolean stripPunctuation = true; // sensible default?
35
36 boolean lowerCase = true; // sensible default?
37
38 /**
39 * Generates new n-grams.
40 *
41 * @param text
42 * @param n
43 * @return
44 */
45 abstract NGrams newNGrams(String text, int n);
46
47 /**
48 *
49 * @param text
50 * @return
51 */
52 public NGrams unigrams(String text)
53 {
54 return newNGrams(preProcess(text), 1);
55 }
56
57 /**
58 *
59 * @param text
60 * @return
61 */
62 public NGrams bigrams(String text)
63 {
64 return newNGrams(preProcess(text), 2);
65 }
66
67 /**
68 *
69 * @param text
70 * @return
71 */
72 public NGrams trigrams(String text)
73 {
74 return newNGrams(preProcess(text), 3);
75 }
76
77 /*
78 * Strips punctuation, lowers case, etc depending on settings.
79 */
80 private String preProcess(String text)
81 {
82 if (collapseWhiteSpace)
83 text = text.replaceAll("\\s+", " ");
84
85 if (stripPunctuation)
86 ; // TODO: strip it
87
88 if (lowerCase)
89 text = text.toLowerCase();
90
91 return text;
92 }
93
94 public boolean isLowerCase()
95 {
96 return lowerCase;
97 }
98
99 public void setLowerCase(boolean lowerCase)
100 {
101 this.lowerCase = lowerCase;
102 }
103
104 public boolean isCollapseWhiteSpace()
105 {
106 return collapseWhiteSpace;
107 }
108
109 public void setCollapseWhiteSpace(boolean collapseWhiteSpace)
110 {
111 this.collapseWhiteSpace = collapseWhiteSpace;
112 }
113
114 public boolean isStripPunctuation()
115 {
116 return stripPunctuation;
117 }
118
119 public void setStripPunctuation(boolean stripPunctuation)
120 {
121 this.stripPunctuation = stripPunctuation;
122 }
123
124 }