Last updated: 2018-01-02
Code version: 800f055
```r
sessionInfo()
```

```
R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.1

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base

other attached packages:
[1] secret_1.0.0

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.14     digest_0.6.12    rprojroot_1.2    assertthat_0.2.0
 [5] jsonlite_1.5     backports_1.1.1  git2r_0.19.0     magrittr_1.5
 [9] evaluate_0.10.1  stringi_1.1.6    curl_3.0         rmarkdown_1.8
[13] tools_3.4.2      stringr_1.2.0    yaml_2.1.15      compiler_3.4.2
[17] htmltools_0.3.6  openssl_0.9.9    knitr_1.17
```
The purpose of this section is to collate the supplied NOW corpus files into a SQL database. The supplied data comes in three table formats, as described in this documentation:

- corpus files in dir `/db`
- dict file in dir `/shared`
- sources files in dir `/shared`

N.B. the original article data held in dirs `wlp/` and `text/` are currently not being incorporated into the database.
The workflow is designed to be run as batch scripts on iceberg/sharc due to the large size of the files and eventual database. I'll be making a makefile to automate the whole process too. The scripts are very specific to the task, with hardwired absolute paths and automatic user detection to ensure only authorised users can run them.
The workflow is currently split up into three scripts:

- `m01_proc-data.sh`: copies and unzips the data into an ooominds1 user directory (can only be run if the user has a personal dir in the `ooominds1/User` dir)
- `m02_make-db.sh`: creates the database in an ooominds1 user directory
- `m03_pop-db.R`: populates the database

The following vignette walks a user through the code used to accomplish this.
N.B. you can mount the shared `ooominds1` directory using the following shell command:

```bash
open 'smb://user:password@uosfstore.shef.ac.uk/shared/ooominds1'
```
## `m01_proc-data.sh`: Copy and extract data using bash script

`user` details are supplied automatically as an argument to the script on sharc/iceberg when run in batch mode (see qsub script `m01_proc-data.sge`).
```bash
user=$1
echo $user

# copy a1517_now db & shared folders into user folder
cp -a /shared/ooominds1/Shared/corpus.byu.edu/a1517_now/db/ '/shared/ooominds1/User/'$user'/a1517_now/'
cp -a /shared/ooominds1/Shared/corpus.byu.edu/a1517_now/shared/ '/shared/ooominds1/User/'$user'/a1517_now/'

# unzip all .zip files into their corresponding folder
find '/shared/ooominds1/User/'$user'/a1517_now/' -type f -iname "*.zip" | while read filename
do
    unzip -o -d "`dirname "$filename"`" "$filename"
done
```
## `m02_make-db.sh`: Create database

Set `user` and `db_path`:
```bash
user=$1
echo $user
db_path='/shared/ooominds1/User/'$user'/now_db'
```
### Remove old `db_path` db and create tables in sqlite3

```bash
rm -f $db_path
```
The database is created by the call to `sqlite3` at `db_path`. In `sqlite3`, create the three data tables (`lexicon`, `corpus`, `sources`), plus a `del_lexicon` table that records lexicon entries removed during cleaning:
```bash
sqlite3 $db_path <<EOF
CREATE TABLE lexicon (
  wordID INTEGER PRIMARY KEY,
  word TEXT,
  lemma TEXT,
  PoS TEXT
);
CREATE TABLE corpus (
  textID INTEGER,
  ID INTEGER PRIMARY KEY,
  wordID INTEGER
);
CREATE TABLE sources (
  textID INTEGER PRIMARY KEY,
  ID INTEGER,
  date TEXT,
  country TEXT,
  source TEXT,
  url TEXT,
  textTitle TEXT
);
CREATE TABLE del_lexicon (
  wordID INTEGER PRIMARY KEY,
  word TEXT,
  lemma TEXT,
  PoS TEXT,
  remove_call TEXT
);
EOF
```
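These primary-key declarations matter downstream: `m03_pop-db.R` queries them back with `PRAGMA table_info()` to decide which column to validate (see `trim2pk()` below). A minimal sketch of that query, assuming a connection to the freshly created (still empty) database; the path here is illustrative:

```r
library(DBI)
library(RSQLite)
# connect to the newly created database (illustrative path)
db <- dbConnect(RSQLite::SQLite(), dbname = "now_db")
# one row per column; `pk` is 1 for the primary-key column, 0 otherwise
dbGetQuery(db, "PRAGMA table_info(corpus)")
#>   cid   name    type notnull dflt_value pk
#> 1   0 textID INTEGER       0         NA  0
#> 2   1     ID INTEGER       0         NA  1
#> 3   2 wordID INTEGER       0         NA  0
dbDisconnect(db)
```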
## `m03_pop-db.R`: Populate db

```r
depend <- c("DBI", "RSQLite", "dplyr", "tm")
if (!require("pacman")) install.packages("pacman")
pacman::p_load(depend, character.only = T)
```
### `pop_db` - populate db table custom function

These are sourced from `R/pop-db_functions.R`:

```r
source(paste0("/shared/ooominds1/User/", user, "/scripts/R/pop-db_functions.R"))
```
```r
populate_nowdb <- function(db, table, source_dir, pattern = ".txt",
                           index = NULL, nrows = -1){
  # internal function to ensure only raw data corresponding to rows with
  # numeric primary keys are included
  trim2pk <- function(db, tmp, table){
    # identify the table's primary-key column from the db schema
    tab_dtypes <- dbGetQuery(db, paste0("PRAGMA table_info(", table, ")"))
    pk <- which(tab_dtypes$pk == 1)
    if(!is.numeric(tmp[, pk])){
      if(is.factor(tmp[, pk])){
        stop("primary key is factor instead of text. Likely read.delim setting error")
      }
      # drop rows whose primary key does not coerce to a number (eg stray headers)
      tmp <- tmp[suppressWarnings(!is.na(as.numeric(tmp[, pk]))), ]
      tmp[, pk] <- as.numeric(tmp[, pk])
      row.names(tmp) <- NULL
    }
    tmp
  }
  # get table header
  table.names <- dbListFields(db, table)
  # if importing corpus files, extract del_lexicon to clean with
  if(table == "corpus"){
    remove_df <- dbReadTable(conn = db, name = "del_lexicon")
    if(nrow(remove_df) == 0){
      stop("'del_lexicon' table is empty. Ensure 'lexicon' is populated before 'corpus'")
    }
  }
  # get vector of pattern matched files to import
  filenames <- list.files(source_dir, pattern = pattern, full.names = T)
  for(filename in filenames){
    cat("importing '", filename, "(", which(filenames == filename), "of",
        length(filenames), ")' \n")
    tmp <- read.delim(filename, header = F, quote = NULL, nrows = nrows,
                      comment = '', skipNul = TRUE, stringsAsFactors = F,
                      strip.white = T) %>% setNames(table.names)
    # convert date format to ISO (prepend the century to two-digit years)
    if(table == "sources"){
      tmp$date <- paste0("20", tmp$date)
    }
    # Ensure only raw data is included - ie weird headers are removed
    tmp <- trim2pk(db, tmp, table)
    # table-specific cleaning
    tmp <- switch(table,
                  "lexicon" = clean_lexicon(db, tmp),
                  "corpus" = trim_table(tmp, remove_df),
                  "sources" = tmp)
    # write file to table
    dbWriteTable(db, name = table, value = tmp, append = T, header = F)
  }
  # create any requested indexes once all files are imported
  if(!is.null(index)){
    for(i in index){
      dbExecute(db, paste0("CREATE INDEX ", i, "_index ON ", table, " (", i, ")"))
      cat("index --", paste0(i, "_index"), " -- created \n")
    }
  }
  cat("### ---- import of", length(filenames), "files into table **", table,
      "** complete ---- ###", "\n", "\n")
}
```
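For a quick test of the import logic, the `nrows` argument (default `-1`, i.e. read everything) caps how much of each file is read. A hedged sketch, assuming `db` and `source_dir` are set as they are later in this script:

```r
# dry run (illustrative): import at most 1000 rows per matched file
populate_nowdb(db, table = "lexicon",
               source_dir = paste0(source_dir, "shared"),
               pattern = "dic.txt", nrows = 1000)
```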
In development I was getting warnings:

1. `In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : EOF within quoted string` → fixed by disabling quote matching in `read.delim` (`quote = NULL`)
2. `In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : embedded nul(s) found in input` → fixed by setting `skipNul = TRUE`
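A minimal sketch reproducing both warnings on a toy file and the settings that silence them (the file contents are invented for illustration; `quote = ""` disables quote matching, to the same effect as the script's `quote = NULL`):

```r
# toy file with an unmatched quote and an embedded nul byte
tmp_file <- tempfile(fileext = ".txt")
writeBin(c(charToRaw("1\tdon\"t stop\n2\tok"), as.raw(0), charToRaw(" fine\n")),
         tmp_file)
# quote = "" avoids "EOF within quoted string";
# skipNul = TRUE drops the nul, avoiding "embedded nul(s) found in input"
read.delim(tmp_file, header = FALSE, quote = "", skipNul = TRUE,
           stringsAsFactors = FALSE)
#>   V1         V2
#> 1  1 don"t stop
#> 2  2    ok fine
```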
```r
#' split_lexicon helper function:
#'
#' splits a table into clean words and words to be removed
#' @details the function currently removes NAs, punctuation marks and paragraph
#' indices and returns a list of length 2 containing:
#' - a df containing the clean lexicon (tmp)
#' - a df of removed words (remove_df)
split_lexicon <- function(tmp){
  # remove NAs
  remove_df <- tmp[is.na(tmp$word),] %>%
    mutate(remove_call = "na")
  tmp <- trim_table(tmp, remove_df)
  # remove paragraph ids (words starting "@@")
  remove_df <- tmp[grep("^@{2}", tmp$word),] %>%
    mutate(remove_call = "paragraph_id") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  # remove html tags
  # (earlier experiment with masking "&" entities, kept for reference)
  # tmp <- tmp %>% mutate(word = gsub("&*", "---%%%---", word))
  # while(length(grep("---%%%---amp", tmp$word)) > 0){
  #   tmp <- tmp %>% mutate(word = gsub("---%%%---amp", "---%%%---", word))
  # }
  # tags <- tmp[grep(".*&.*", tmp$word),"word"] %>% gsub("^.*&", "", .) %>%
  #   gsub(";.*$", "", .) %>% unique()
  remove_df <- tmp[grep("&", tmp$word),] %>%
    mutate(remove_call = "html_tags") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  remove_df <- tmp[grep("<.*>", tmp$word),] %>%
    mutate(remove_call = "html_tags") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  remove_df <- tmp[grep(".*;TOOLONG", tmp$word),] %>%
    mutate(remove_call = "html_tags") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  remove_df <- tmp[grep("^/", tmp$word),] %>%
    mutate(remove_call = "slash_start") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  # tmp <- tmp %>% mutate(word = gsub("---%%%---", "&", word))
  #
  # remove punctuation (words that are empty once punctuation is stripped)
  remove_df <- tmp[removePunctuation(tmp$word, preserve_intra_word_dashes = T) %>%
                     stripWhitespace() == "",] %>%
    mutate(remove_call = "punctuation") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  list(tmp = tmp, remove_df = remove_df)
}
```
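A toy illustration of the split (a sketch: it assumes the functions above are sourced and `dplyr`/`tm` are loaded; the lexicon rows are invented):

```r
toy <- data.frame(wordID = 1:5,
                  word  = c("well-known", NA, "@@1234567", "&amp;", "..."),
                  lemma = c("well-known", NA, "@@1234567", "&amp;", "..."),
                  PoS   = c("jj", "", "", "", "y"),
                  stringsAsFactors = FALSE)
out <- split_lexicon(toy)
out$tmp$word                               # "well-known" survives (dash kept)
out$remove_df[, c("word", "remove_call")]  # NA/"na", "@@1234567"/"paragraph_id",
                                           # "&amp;"/"html_tags", "..."/"punctuation"
```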
```r
#' trim helper function
#'
#' trim table of words marked to be removed in remove_df
trim_table <- function(tmp, remove_df){
  anti_join(tmp, remove_df, by = "wordID")
}

#' Clean lexicon
#'
#' split the lexicon, log removed words to the del_lexicon table and
#' return the clean lexicon
#' @param db database connection
#' @param tmp table to clean
#' @details the function currently removes NAs and punctuation marks
#' @export
clean_lexicon <- function(db, tmp){
  out <- split_lexicon(tmp)
  # record removed words (and why) in del_lexicon for later corpus cleaning
  dbWriteTable(db, name = "del_lexicon", value = out$remove_df, append = T,
               header = F)
  out$tmp
}
```
Assign the `user`, `source_dir`, `db_path` and `shared_db_path` (from defaults or supplied arguments). Echo the settings to the log.
```r
args <- commandArgs(trailingOnly = TRUE)
if(length(args) == 0){user <- "ac1adk"}else{user <- args[1]}
if(length(args) >= 2){source_dir <- args[2]}else{
  source_dir <- paste0("/shared/ooominds1/User/", user, "/a1517_now/")}
if(length(args) >= 3){db_path <- args[3]}else{
  db_path <- paste0("/shared/ooominds1/User/", user, "/now_db")}
if(length(args) >= 4){shared_db_path <- args[4]}else{
  shared_db_path <- "/shared/ooominds1/Shared/corpus.byu.edu/a1517_now/"}

cat("user:", user, "\n",
    "source_dir:", source_dir, "\n",
    "db_path:", db_path, "\n")
```
### Connect to db

```r
db <- dbConnect(RSQLite::SQLite(), dbname = db_path)
# helper (defined outside this script) that mounts the shared ooominds1
# volume, cf. the smb:// command above
mount_ooominds1_volume()

dbListFields(db, "corpus")
#> [1] "textID" "ID"     "wordID"
dbListFields(db, "lexicon")
#> [1] "wordID" "word"   "lemma"  "PoS"
dbListFields(db, "sources")
#> [1] "textID"    "ID"        "date"      "country"   "source"    "url"
#> [7] "textTitle"
dbListTables(db)
#> [1] "corpus"      "del_lexicon" "lexicon"     "sources"
```
### Populate db tables

#### `lexicon`

```r
populate_nowdb(db, table = "lexicon", source_dir = paste0(source_dir, "shared"),
               pattern = "dic.txt", index = "PoS")
```
#### `sources`

```r
populate_nowdb(db, table = "sources", source_dir = paste0(source_dir, "shared"),
               pattern = "sources_pt.\\.txt", index = c("source", "date"))
```
#### `corpus`

```r
populate_nowdb(db, table = "corpus", source_dir = paste0(source_dir, "db"),
               index = c("textID", "wordID"))
```
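Not part of the original script, but a handy sanity check one might run before disconnecting: join `corpus` to `lexicon` on `wordID` (exercising the indexes built above) and count tokens per part of speech:

```r
# illustrative spot check: token counts per PoS via the wordID join
dbGetQuery(db, "
  SELECT l.PoS, COUNT(*) AS n_tokens
  FROM corpus c
  JOIN lexicon l ON c.wordID = l.wordID
  GROUP BY l.PoS
  ORDER BY n_tokens DESC
  LIMIT 10")
```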
```r
dbDisconnect(db)
```
This R Markdown site was created with workflowr