-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess.r
135 lines (116 loc) · 6.02 KB
/
process.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Load a count dictionary from `location`, or seed a new one when the file does
# not exist yet, and make sure it has a count column for `writer.name`.
#   location    - path of the CSV file written by a previous run
#   key.name    - name of the key column ("char" or "len")
#   seed.key    - key of the single zero-count row that starts a new dictionary
#   writer.name - writer whose count column must be present
# Returns a data frame whose first column is the key and whose remaining
# columns hold one count column per writer.
dict.load <- function(location, key.name, seed.key, writer.name) {
  if (file.exists(location)) {
    dict <- read.csv(location, sep = ",")
    dict$X <- NULL # Drop the row-name column that write.csv adds
  } else {
    dict <- data.frame(seed.key, 0, stringsAsFactors = FALSE)
    colnames(dict) <- c(key.name, writer.name)
  }
  # Writers seen for the first time start with a count of 0 for every key
  if (!(writer.name %in% colnames(dict))) {
    dict[[writer.name]] <- rep(0, nrow(dict))
  }
  dict
}

# Add the number of occurrences of every value in `keys` to the `writer.name`
# column of `dict`. Keys already in the dictionary are incremented; unseen keys
# are appended in order of first appearance, with 0 for every other writer.
# Returns the updated data frame.
dict.tally <- function(dict, keys, writer.name) {
  if (length(keys) == 0) {
    return(dict) # Nothing to count, e.g. empty input or text without letters
  }
  seen <- unique(keys)
  counts <- as.numeric(tabulate(match(keys, seen), nbins = length(seen)))

  # Increment the rows whose key occurs in the text
  hit <- match(dict[[1]], seen)
  found <- !is.na(hit)
  totals <- dict[[writer.name]]
  totals[found] <- totals[found] + counts[hit[found]]
  dict[[writer.name]] <- totals

  # Append one row per key that is not in the dictionary yet
  is.new <- !(seen %in% dict[[1]])
  if (any(is.new)) {
    fresh <- as.data.frame(matrix(0, nrow = sum(is.new), ncol = ncol(dict)))
    colnames(fresh) <- colnames(dict)
    fresh[[1]] <- seen[is.new]
    fresh[[writer.name]] <- counts[is.new]
    dict <- rbind(dict, fresh)
  }
  dict
}

# Count the normalised characters and the word lengths of one writer's text and
# accumulate them into two CSV dictionaries shared by all writers.
#   writer.name        - name of the count column to create or update
#   file.location      - path of the text file to read
#   char.dict.location - CSV holding per-writer character counts
#   word.dict.location - CSV holding per-writer word-length counts
# Each dictionary is read if its file exists and seeded otherwise, updated,
# sorted by key and written back with write.csv. Stops with an error if
# `file.location` does not exist. Returns NULL invisibly.
process.data <- function(writer.name, file.location,
                         char.dict.location = "/Users/Chirag/Documents/Dropbox/Programming/2013-08/Which_Blogger/char_dict.txt",
                         word.dict.location = "/Users/Chirag/Documents/Dropbox/Programming/2013-08/Which_Blogger/word_dict.txt") {
  if (!file.exists(file.location)) {
    stop("Input file does not exist: ", file.location, call. = FALSE)
  }

  # Load (or create) both dictionaries; each one resolves the writer column by
  # name, so the two files do not need to share a column layout
  char.dict <- dict.load(char.dict.location, "char", "0", writer.name)
  char.dict$char <- as.character(char.dict$char)
  # The seed length is numeric so that a new dictionary sorts 2 before 13
  word.dict <- dict.load(word.dict.location, "len", 1, writer.name)
  word.dict$len <- as.numeric(word.dict$len)

  # Read all text from input file
  original <- readChar(file.location, file.info(file.location)$size)
  if (length(original) == 0) {
    original <- ""
  }

  # Modify text to consolidate similar characters
  text <- original
  text <- gsub("[a-z]", "a", text) # Make all lowercase letters a's
  text <- gsub("[A-Z]", "A", text) # Make all uppercase letters A's
  text <- gsub("\\d+", "0", text) # Collapse every run of digits into one 0
  text <- gsub("\n", " ", text) # Change new lines to spaces
  text <- gsub("–", "-", text) # Change en dashes to hyphens
  text <- gsub("—", "-", text) # Change em dashes to hyphens
  text <- gsub("'", "‘", text) # Change straight apostrophe to open apostrophe
  text <- gsub("’", "‘", text) # Change close apostrophe to open apostrophe
  text <- gsub("″", "“", text) # Change double prime to open quote
  text <- gsub("”", "“", text) # Change close quote to open quote
  text <- gsub(")", "(", text, fixed = TRUE) # Change close to open parenthesis
  # Brackets, braces and angle brackets (either direction) and the remaining
  # symbols are all counted together as an asterisk
  symbols <- c(
    "]", "[", "}", "{", ">", "<", "\\", "&", "#", "%", "`", "^", "+", "|",
    "=", "@", "$"
  )
  for (symbol in symbols) {
    text <- gsub(symbol, "*", text, fixed = TRUE)
  }

  # Split text into characters and add them to the character dictionary
  chars <- strsplit(text, split = "")[[1]]
  chars <- chars[nzchar(chars)]
  char.dict <- dict.tally(char.dict, chars, writer.name)

  # Start again from the raw text: drop apostrophes so contractions stay one
  # word, then turn everything that is not a letter into a separator
  words <- gsub("'", "", original)
  words <- gsub("’", "", words)
  words <- gsub("‘", "", words)
  words <- gsub("[^a-zA-Z]", " ", words)
  words <- strsplit(words, split = " ")[[1]]
  # Remove words that are 0 characters long
  words <- words[nchar(words) > 0]
  word.dict <- dict.tally(word.dict, nchar(words), writer.name)

  # Sort dicts by key
  word.dict <- word.dict[order(word.dict$len), ]
  char.dict <- char.dict[order(char.dict$char), ]

  # Write dicts
  write.csv(char.dict, char.dict.location)
  write.csv(word.dict, word.dict.location)
  invisible(NULL)
}
# Writers whose exported WordPress text is folded into the dictionaries. Each
# writer's text is expected in a file named after the lower-cased writer name.
names <- c(
  "Aaras", "Ankit", "Anuj", "Chirag", "Elina",
  "Mihir", "Nakul", "Reena", "Ruchi", "Shrinidhi"
)
for (i in seq_along(names)) {
  writer.name <- names[i]
  file.location <- paste0(
    "/Users/Chirag/Documents/Dropbox/Programming/2013-08/Which_Blogger/Wordpress/",
    tolower(writer.name),
    ".txt"
  )
  process.data(writer.name, file.location)
}