# tm-tweets.r
# install any missing packages
for (pkg in c('twitteR', 'tm', 'openNLP', 'openNLPmodels.en', 'Snowball', 'RWeka'))
  if (!pkg %in% rownames(installed.packages())) install.packages(pkg)
# load the libraries
library(twitteR)
library(tm)
# library(Snowball)
# library(openNLP)
# library(RWeka)
# get 100 of gerad's friends
friends <- userFriends('gerad', n=100)
# get the tweets for those friends
# (note: fetching up to 500 tweets for each of 100 users can hit Twitter's API rate limits)
tweets <- list()
for (friend in friends) tweets <- c(tweets, userTimeline(friend, n=500))
# get just the text from the tweets (ignore other metadata)
tweetsText <- unlist(Map(statusText, tweets))
# tag the parts of speech in the tweets (not useful)
# tweetsPOS <- tagPOS(tweetsText)
# create a corpus from the text of those tweets
tweetsCorpus <- Corpus(VectorSource(tweetsText))
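# stemming could instead be applied at the corpus level before building the
# matrix (a sketch, assuming tm's stemDocument transformation, which relies on
# Snowball; left commented out like the other alternatives):
# tweetsCorpus <- tm_map(tweetsCorpus, stemDocument)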
# create a term document matrix for the tweets (doing some basic cleanup along the way)
# here is where we'd use different weighting functions, for example TfIdf
tweetsTDM <- TermDocumentMatrix(tweetsCorpus,
control = list(
# tokenize = tokenize, # use the opennlp POS tokenizer
# tokenize = NGramTokenizer, # use the RWeka NGram tokenizer
# stemming = TRUE, # perform word stemming - not working
stopwords = TRUE, # remove stopwords
tolower = TRUE, # convert terms to lowercase
removePunctuation = TRUE)) # remove punctuation
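# for instance, a TfIdf-weighted matrix (a sketch using tm's built-in
# weightTfIdf, which down-weights terms that appear across most tweets):
# tweetsTDMTfIdf <- TermDocumentMatrix(tweetsCorpus,
#   control = list(
#     weighting = weightTfIdf,
#     stopwords = TRUE,
#     tolower = TRUE,
#     removePunctuation = TRUE))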
# calculate term frequencies
tweetsWordFrequencies <- apply(tweetsTDM, 1, sum)
# TODO figure out how to limit this to just the nouns / proper nouns
# grab the 100 most common terms
tweetsFrequentWords <- sort(tweetsWordFrequencies, decreasing=T)[1:100]
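# tm can also do this directly with a count threshold instead of a fixed top-n,
# e.g. findFreqTerms(tweetsTDM, lowfreq = 10) (the threshold here is arbitrary)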
# get the matching tweets for the most frequent word
# (index the term's row, then keep the documents/columns where it appears)
inspect(tweetsCorpus[apply(tweetsTDM[names(tweetsFrequentWords[1]), , drop = FALSE], 2, function(x) x > 0)])
# find words associated with the second most frequent word - this is slow!
findAssocs(tweetsTDM, names(tweetsFrequentWords[2]), 0.2)
# other things to play with from here
# clustering: dissimilarity, hclust, kmeans, cl_agreement, specc
# classification: knn, ksvm
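# a minimal sketch of the clustering idea, using only base R (the top terms are
# clustered by Euclidean distance between their per-document count profiles;
# dissimilarity() or another distance measure could be swapped in):
# freqMatrix <- as.matrix(tweetsTDM[names(tweetsFrequentWords), ])
# termClusters <- hclust(dist(freqMatrix))
# plot(termClusters)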