# Data Science Specialization Capstone Project (1/2): The algorithm

---

This is the algorithm built for my Capstone Project of the Data Science Specialization by Johns Hopkins University on Coursera.
The goal of the Capstone Project was to implement a model to predict a word given one or more words and to develop a web app to use it.
Below you can find how I created the tables used for the app.
In the next post there will be the code for the app.

### IMPORT DATA

```r
#twitter
conn <- file("C:/Users/figinim/Documents/Studies/Capstone Project/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", open="rb")
twitter <- readLines(conn, encoding = "UTF-8", skipNul = TRUE)
close(conn)

#news
conn <- file("C:/Users/figinim/Documents/Studies/Capstone Project/Coursera-SwiftKey/final/en_US/en_US.news.txt", open="rb")
news <- readLines(conn, encoding = "UTF-8", skipNul = TRUE)
close(conn)

#blogs
conn <- file("C:/Users/figinim/Documents/Studies/Capstone Project/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", open="rb")
blogs <- readLines(conn, encoding = "UTF-8", skipNul = TRUE)
close(conn)
rm(conn)

#normalize any stray latin1 bytes to UTF-8, dropping what cannot be converted
twitter <- iconv(twitter, from = "latin1", to = "UTF-8", sub="")
news <- iconv(news, from = "latin1", to = "UTF-8", sub="")
blogs <- iconv(blogs, from = "latin1", to = "UTF-8", sub="")

#combine the three corpora into the single vector the cleaning step works on
DataComplete <- c(twitter, news, blogs)
```

### CLEAN DATASET

```r
#clean the data: replace symbols that are not sentence boundaries
#(hash, quote, dash, underscore, comma) with a space, in one vectorized call
DataClean <- gsub("[#\"_,-]", " ", DataComplete)

#split each line wherever there is sentence-ending punctuation
#(every punctuation mark except the comma), in one regex split
DataClean <- unlist(strsplit(DataClean, "[.!?:;()/]"))

#append the symbol | to each sentence so n-grams never cross a sentence boundary
DataClean <- paste(DataClean,"|")

#convert all letters to lower case
DataClean <- tolower(DataClean)

#delete white spaces before and after every sentence
DataClean <- trimws(DataClean)

#collapse two or more consecutive spaces into a single one
DataClean <- gsub("\\s+", " ", DataClean)

#split into single words
DataClean <- unlist(strsplit(DataClean, "\\s"))
```

### BIGRAM TABLE

```r
library(quanteda)
library(dplyr)

#two consecutive words; char_ngrams() works directly on a character vector
#of tokens and returns a character vector (tokens_ngrams() expects a tokens object)
Data2Gram <- char_ngrams(DataClean, concatenator=" ")

#delete pairs that cross a sentence boundary (contain "|")
Data2Gram <- Data2Gram[!grepl("\\|", Data2Gram)]

#convert to data.frame
DF2gram <- data.frame(unlist(Data2Gram))

#find frequencies and order from most to least common
Freq2Gram <- as.data.frame(table(DF2gram))
Freq2Gram <- Freq2Gram[order(-Freq2Gram$Freq),]

#separate the two words into their own columns (X1 = start word, X2 = completion)
Freq2Table <- data.frame(do.call('rbind', strsplit(as.character(Freq2Gram$DF2gram),' ',fixed=TRUE)),Freq2Gram$Freq)

#take the most frequent completion for every starting word
Total2Gram <- Freq2Table[order(Freq2Table$Freq2Gram.Freq,decreasing=TRUE),]
Top2Gram <- Total2Gram[!duplicated(Total2Gram$X1),]

#second most frequent completion
Rest2Gram <- anti_join(Total2Gram, Top2Gram, by = c("X1" = "X1", "X2" = "X2"))
Rest2Gram <- Rest2Gram[order(Rest2Gram$Freq2Gram.Freq,decreasing=TRUE),]
Second2Gram <- Rest2Gram[!duplicated(Rest2Gram$X1),]

#third most frequent completion
Other2Gram <- anti_join(Rest2Gram, Second2Gram, by = c("X1" = "X1", "X2" = "X2"))
Other2Gram <- Other2Gram[order(Other2Gram$Freq2Gram.Freq,decreasing=TRUE),]
Third2Gram <- Other2Gram[!duplicated(Other2Gram$X1),]

#lookup table with the top 3 completions per starting word
WordBigram <- left_join(Top2Gram, Second2Gram, by=c("X1"="X1"))
WordBigram <- left_join(WordBigram, Third2Gram, by=c("X1"="X1"))
colnames(WordBigram) <- c("Start","First","Freq1","Second","Freq2","Third","Freq3")

#delete rows whose starting word contains "<" or ">" (encoding artifacts)
WordBigram <- WordBigram[!grepl("<", WordBigram$Start),]
WordBigram <- WordBigram[!grepl(">", WordBigram$Start),]

#blank out completions containing "<" or ">"
#(sub() with replacement = NA sets the whole matching element to NA)
WordBigram <- as.data.frame(sapply(WordBigram,sub,pattern='<',replacement=NA))
WordBigram <- as.data.frame(sapply(WordBigram,sub,pattern='>',replacement=NA))

#SEARCH!
WordBigram %>% filter(Start == "word") %>% select(First,Second,Third)
```

### TRIGRAM TABLE

```r
#not enough memory for the trigram with the complete data on my notebook... a 30 percent sample is OK
set.seed(190587)
DataClean <- sample(DataClean, length(DataClean)*0.3)

#three consecutive words; char_ngrams() works directly on a character vector
#of tokens and returns a character vector (tokens_ngrams() expects a tokens object)
Data3Gram <- char_ngrams(DataClean, n = 3L, concatenator=" ")

#delete triples that cross a sentence boundary (contain "|")
Data3Gram <- Data3Gram[!grepl("\\|", Data3Gram)]

#convert to data.frame
DF3gram <- data.frame(unlist(Data3Gram))

#find frequencies, order, and separate the three words into columns
Freq3Gram <- as.data.frame(table(DF3gram))
Freq3Gram <- Freq3Gram[order(-Freq3Gram$Freq),]
Freq3Table <- data.frame(do.call('rbind', strsplit(as.character(Freq3Gram$DF3gram),' ',fixed=TRUE)),Freq3Gram$Freq)

#keep only trigrams seen more than once
Freq3Table <- filter(Freq3Table, Freq3Gram.Freq>1)

#join the first two words into the bigram prefix X; X3 is the completion
Freq3Table <- mutate(Freq3Table,X=paste(X1,X2))
Freq3Table <- select(Freq3Table,X,X3,Freq3Gram.Freq)

#take the most frequent completion for every bigram prefix
Total3Gram <- Freq3Table[order(Freq3Table$Freq3Gram.Freq,decreasing=TRUE),]
Top3Gram <- Total3Gram[!duplicated(Total3Gram$X),]

#second most frequent completion
Rest3Gram <- anti_join(Total3Gram, Top3Gram, by = c("X" = "X", "X3" = "X3"))
Rest3Gram <- Rest3Gram[order(Rest3Gram$Freq3Gram.Freq,decreasing=TRUE),]
Second3Gram <- Rest3Gram[!duplicated(Rest3Gram$X),]

#third most frequent completion
Other3Gram <- anti_join(Rest3Gram, Second3Gram, by = c("X" = "X", "X3" = "X3"))
Other3Gram <- Other3Gram[order(Other3Gram$Freq3Gram.Freq,decreasing=TRUE),]
Third3Gram <- Other3Gram[!duplicated(Other3Gram$X),]

#lookup table with the top 3 completions per bigram prefix
WordTrigram <- left_join(Top3Gram, Second3Gram, by=c("X"="X"))
WordTrigram <- left_join(WordTrigram, Third3Gram, by=c("X"="X"))
colnames(WordTrigram) <- c("Start","First","Freq1","Second","Freq2","Third","Freq3")

#delete rows whose starting bigram contains "<" or ">" (encoding artifacts)
WordTrigram <- WordTrigram[!grepl("<", WordTrigram$Start),]
WordTrigram <- WordTrigram[!grepl(">", WordTrigram$Start),]

#blank out completions containing "<" or ">"
#(sub() with replacement = NA sets the whole matching element to NA)
WordTrigram <- as.data.frame(sapply(WordTrigram,sub,pattern='<',replacement=NA))
WordTrigram <- as.data.frame(sapply(WordTrigram,sub,pattern='>',replacement=NA))

#SEARCH!
WordTrigram %>% filter(Start == "as soon") %>% select(First,Second,Third)
```

### QUADRIGRAM TABLE

```r
#not enough memory for the quadrigram with the complete data on my notebook... a 25 percent sample is OK
set.seed(190587)
DataClean <- sample(DataClean, length(DataClean)*0.25)

#four consecutive words; char_ngrams() works directly on a character vector
#of tokens and returns a character vector (tokens_ngrams() expects a tokens object)
Data4Gram <- char_ngrams(DataClean, n = 4L, concatenator=" ")

#delete quadruples that cross a sentence boundary (contain "|")
Data4Gram <- Data4Gram[!grepl("\\|", Data4Gram)]

#convert to data.frame
Data4gram <- unlist(Data4Gram)
DF4gram <- data.frame(Data4gram)

#find frequencies and order
Freq4Gram <- as.data.frame(table(DF4gram))
Freq4Gram <- Freq4Gram[order(-Freq4Gram$Freq),]

#keep only quadrigrams seen more than once, then split the four words
#into columns and join the first three into the trigram prefix X
Freq4GramFilter <- filter(Freq4Gram, Freq>1)
Freq4Table <- data.frame(do.call('rbind', strsplit(as.character(Freq4GramFilter$Data4gram),' ',fixed=TRUE)),Freq4GramFilter$Freq)
Freq4Table <- mutate(Freq4Table,X=paste(X1,X2,X3))
Freq4Table <- select(Freq4Table,X,X4,Freq4GramFilter.Freq)

#take the most frequent completion for every trigram prefix
Total4Gram <- Freq4Table[order(Freq4Table$Freq4GramFilter.Freq,decreasing=TRUE),]
Top4Gram <- Total4Gram[!duplicated(Total4Gram$X),]

#second most frequent completion
Rest4Gram <- anti_join(Total4Gram, Top4Gram, by = c("X" = "X", "X4" = "X4"))
Rest4Gram <- Rest4Gram[order(Rest4Gram$Freq4GramFilter.Freq,decreasing=TRUE),]
Second4Gram <- Rest4Gram[!duplicated(Rest4Gram$X),]

#third most frequent completion
Other4Gram <- anti_join(Rest4Gram, Second4Gram, by = c("X" = "X", "X4" = "X4"))
Other4Gram <- Other4Gram[order(Other4Gram$Freq4GramFilter.Freq,decreasing=TRUE),]
Third4Gram <- Other4Gram[!duplicated(Other4Gram$X),]

#lookup table with the top 3 completions per trigram prefix
WordQuadrigram <- left_join(Top4Gram, Second4Gram, by=c("X"="X"))
WordQuadrigram <- left_join(WordQuadrigram, Third4Gram, by=c("X"="X"))
colnames(WordQuadrigram) <- c("Start","First","Freq1","Second","Freq2","Third","Freq3")

#delete rows whose starting trigram contains "<" or ">" (encoding artifacts)
WordQuadrigram <- WordQuadrigram[!grepl("<", WordQuadrigram$Start),]
WordQuadrigram <- WordQuadrigram[!grepl(">", WordQuadrigram$Start),]

#blank out completions containing "<" or ">"
#(sub() with replacement = NA sets the whole matching element to NA)
WordQuadrigram <- as.data.frame(sapply(WordQuadrigram,sub,pattern='<',replacement=NA))
WordQuadrigram <- as.data.frame(sapply(WordQuadrigram,sub,pattern='>',replacement=NA))

#SEARCH!
WordQuadrigram %>% filter(Start == "as soon as") %>% select(First,Second,Third)
```