1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package net.sf.textkit4j.matching;
21
22
23
24
25
26
27
28
29
30 public abstract class NGramFactory
31 {
32 boolean collapseWhiteSpace = true;
33
34 boolean stripPunctuation = true;
35
36 boolean lowerCase = true;
37
38
39
40
41
42
43
44
45 abstract NGrams newNGrams(String text, int n);
46
47
48
49
50
51
52 public NGrams unigrams(String text)
53 {
54 return newNGrams(preProcess(text), 1);
55 }
56
57
58
59
60
61
62 public NGrams bigrams(String text)
63 {
64 return newNGrams(preProcess(text), 2);
65 }
66
67
68
69
70
71
72 public NGrams trigrams(String text)
73 {
74 return newNGrams(preProcess(text), 3);
75 }
76
77
78
79
80 private String preProcess(String text)
81 {
82 if (collapseWhiteSpace)
83 text = text.replaceAll("\\s+", " ");
84
85 if (stripPunctuation)
86 ;
87
88 if (lowerCase)
89 text = text.toLowerCase();
90
91 return text;
92 }
93
94 public boolean isLowerCase()
95 {
96 return lowerCase;
97 }
98
99 public void setLowerCase(boolean lowerCase)
100 {
101 this.lowerCase = lowerCase;
102 }
103
104 public boolean isCollapseWhiteSpace()
105 {
106 return collapseWhiteSpace;
107 }
108
109 public void setCollapseWhiteSpace(boolean collapseWhiteSpace)
110 {
111 this.collapseWhiteSpace = collapseWhiteSpace;
112 }
113
114 public boolean isStripPunctuation()
115 {
116 return stripPunctuation;
117 }
118
119 public void setStripPunctuation(boolean stripPunctuation)
120 {
121 this.stripPunctuation = stripPunctuation;
122 }
123
124 }