-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprediction.R
97 lines (78 loc) · 2.87 KB
/
prediction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
library(data.table)
library(tm)
library(stringr)
# Restore the pre-computed n-gram tables from .RData files in the current
# working directory. load() recreates objects under whatever names they were
# saved with.
# NOTE(review): the functions below read `unigram.freq`, `bigram.freq`,
# `trigram.freq` and `fourgram.freq`; presumably these four files define
# them -- confirm against the script that wrote the files.
load("unigrams_df.RData")
load("bigrams_df.RData")
load("trigrams_df.RData")
load("fourgrams_df.RData")
# Sample sentence fragments; each one is passed to predictNextWord() at the
# bottom of the script to predict the word that follows it.
sent1 <- "The guy in front of me just bought a pound of bacon a bouquet and a case of"
sent2 <- "You're the reason why I smile everyday. Can you follow me please? It would mean the"
sent3 <- "Very early observations on the Bills game: Offense still struggling but the"
sent4 <- "Be grateful for the good times and keep the faith during the"
#' Normalise a raw sentence so it can be matched against n-gram tables.
#'
#' Steps, in order: mark the input as UTF-8, drop every character that is not
#' a letter, whitespace or punctuation, blank out @mentions, #hashtags and
#' URLs, lower-case, strip punctuation and digits, remove the words listed in
#' "datasets/badwords.txt" (one per line, relative to the working directory),
#' and finally collapse runs of whitespace.
#'
#' @param sentence Character vector; only the first element is cleaned and
#'   returned.
#' @return A single character string with the cleaned text ("" for
#'   zero-length input).
cleanSentence <- function(sentence) {
  if (length(sentence) == 0L) {
    return("")
  }
  Encoding(sentence) <- "UTF-8"

  # readLines() on a path opens and closes its own connection, so nothing is
  # left open if reading fails. Blank entries would add nothing to the filter.
  badwords <- trimws(readLines("datasets/badwords.txt", warn = FALSE))
  badwords <- badwords[nzchar(badwords)]

  # Keep only letters, whitespace and punctuation (digits and other symbols
  # are dropped here; punctuation is still needed to recognise URLs below).
  sentence <- gsub("[^[:alpha:][:space:][:punct:]]", "", sentence)

  simple_corpus <- VCorpus(VectorSource(sentence))
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

  # POSIX classes are used because `\\s` inside `[...]` is only guaranteed to
  # mean "whitespace" with perl = TRUE, which gsub() is not given here.
  simple_corpus <- tm_map(simple_corpus, toSpace, "@[^[:space:]]+") # mentions
  simple_corpus <- tm_map(simple_corpus, toSpace, "#[^[:space:]]+") # hashtags
  # Stop at the first whitespace so a URL cannot swallow the rest of the text.
  simple_corpus <- tm_map(simple_corpus, toSpace, "(f|ht)tps?://[^[:space:]]+")

  # Plain base functions must be wrapped so the corpus keeps holding text
  # documents rather than bare character vectors.
  simple_corpus <- tm_map(simple_corpus, content_transformer(tolower))
  simple_corpus <- tm_map(simple_corpus, removePunctuation)
  simple_corpus <- tm_map(simple_corpus, removeNumbers)
  if (length(badwords) > 0L) {
    simple_corpus <- tm_map(simple_corpus, removeWords, badwords)
  }
  # Whitespace is collapsed last: removing words leaves gaps behind.
  simple_corpus <- tm_map(simple_corpus, stripWhitespace)

  trimws(as.character(simple_corpus[[1]]))
}
#' Look up the first n-gram that continues a given run of words.
#'
#' The words `sentArray[subsetIndex]` are joined with single spaces and
#' compared literally (no regular-expression interpretation) against the start
#' of every entry in `df$ngram`. A trailing space is part of the prefix, so
#' the match must end on a word boundary and be followed by at least one more
#' word: "case of" matches "case of beer" but not "case offers".
#'
#' @param sentArray Character vector of words.
#' @param subsetIndex Index vector selecting the context words in `sentArray`.
#' @param df Table with an `ngram` column of space-separated n-grams. The
#'   first matching row wins. NOTE(review): presumably rows are ordered by
#'   descending frequency -- confirm where the tables are built.
#' @return The matching n-gram as a character string, or the numeric sentinel
#'   -1 when nothing matches or the context is empty.
predictBasedNGram <- function(sentArray, subsetIndex, df) {
  context <- sentArray[subsetIndex]
  if (length(context) == 0L || anyNA(context)) {
    return(-1)
  }

  prefix <- paste0(paste(context, collapse = " "), " ")
  # as.character() also covers an `ngram` column stored as a factor.
  ngrams <- as.character(df$ngram)
  # which() drops NA comparisons that arise from missing n-grams.
  hits <- which(startsWith(ngrams, prefix))

  if (length(hits) == 0L) {
    return(-1)
  }
  ngrams[hits[1L]]
}
#' Predict the word that follows a sentence fragment using n-gram back-off.
#'
#' The sentence is cleaned with cleanSentence() and split into words. The last
#' three words are looked up in `fourgram.freq`; on a miss the last two are
#' looked up in `trigram.freq`, then the last word in `bigram.freq`. If every
#' lookup misses, the first entry of `unigram.freq` is returned. Orders that
#' need more context words than the sentence has are skipped.
#'
#' Side effects: prints the input sentence and which table produced the
#' answer.
#'
#' NOTE(review): the four `*.freq` tables are read from the enclosing
#' environment; presumably they come from the .RData files loaded at the top
#' of the script -- confirm.
#'
#' @param sentence Character string holding the text typed so far.
#' @return The predicted next word.
predictNextWord <- function(sentence) {
  print(sentence)

  # Tokenise the cleaned text, not the raw input, so the lookup keys are
  # normalised the same way cleanSentence() normalises them.
  cleaned <- as.character(cleanSentence(sentence))[1L]
  sentArray <- unlist(strsplit(trimws(cleaned), split = "[[:space:]]+"))
  sentArray <- sentArray[!is.na(sentArray) & nzchar(sentArray)]
  sentLen <- length(sentArray)

  # identical() gives a single TRUE/FALSE whatever type the lookup returned,
  # unlike `!=`, which can produce NA for factors.
  isMiss <- function(res) identical(res, -1)
  lastWord <- function(res) word(as.character(res[[1L]]), -1)

  if (sentLen >= 3L) {
    res <- predictBasedNGram(sentArray, (sentLen - 2L):sentLen, fourgram.freq)
    if (!isMiss(res)) {
      print("return in fourgram evaluation")
      return(lastWord(res))
    }
  }

  if (sentLen >= 2L) {
    res <- predictBasedNGram(sentArray, (sentLen - 1L):sentLen, trigram.freq)
    if (!isMiss(res)) {
      print("return in trigram evaluation")
      return(lastWord(res))
    }
  }

  if (sentLen >= 1L) {
    res <- predictBasedNGram(sentArray, sentLen, bigram.freq)
    if (!isMiss(res)) {
      print("return in bigram evaluation")
      return(lastWord(res))
    }
  }

  print("return in unigram evaluation")
  # First value of the first column.
  # NOTE(review): assumes that column holds the words, most frequent first --
  # confirm.
  as.character(unigram.freq[[1L]][1L])
}
# Demo run: predict and print the next word for each sample sentence.
predictedWord <- predictNextWord(sent1)
print(predictedWord)

predictedWord <- predictNextWord(sent2)
print(predictedWord)

predictedWord <- predictNextWord(sent3)
print(predictedWord)

predictedWord <- predictNextWord(sent4)
print(predictedWord)