public class Twokenize
extends java.lang.Object
| Modifier and Type | Field and Description |
|---|---|
| static java.lang.String | Email |
| static java.lang.String | emoticon |
| static java.lang.String | url |
| Constructor and Description |
|---|
| Twokenize() |
| Modifier and Type | Method and Description |
|---|---|
| static java.lang.String | normalizeTextForTagger(java.lang.String text)<br>Twitter text comes HTML-escaped, so unescape it. |
| static java.lang.String | OR(java.lang.String... parts) |
| static java.lang.String | splitEdgePunct(java.lang.String input) |
| static java.lang.String | squeezeWhitespace(java.lang.String input)<br>`"foo   bar "` => `"foo bar"` (runs of whitespace become a single space; trailing whitespace is dropped) |
| static java.util.List&lt;java.lang.String&gt; | tokenize(java.lang.String text)<br>Assume 'text' has no HTML escaping. |
| static java.util.List&lt;java.lang.String&gt; | tokenizeRawTweetText(java.lang.String text)<br>This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger. |
public static java.lang.String url
public static java.lang.String emoticon
public static java.lang.String Email
public static java.lang.String OR(java.lang.String... parts)
public static java.lang.String splitEdgePunct(java.lang.String input)
public static java.lang.String squeezeWhitespace(java.lang.String input)
public static java.util.List<java.lang.String> tokenize(java.lang.String text)
public static java.lang.String normalizeTextForTagger(java.lang.String text)
public static java.util.List<java.lang.String> tokenizeRawTweetText(java.lang.String text)