Last updated: 2018-01-02

Code version: 800f055

Session information

sessionInfo()
R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.1

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] secret_1.0.0

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.14     digest_0.6.12    rprojroot_1.2    assertthat_0.2.0
 [5] jsonlite_1.5     backports_1.1.1  git2r_0.19.0     magrittr_1.5    
 [9] evaluate_0.10.1  stringi_1.1.6    curl_3.0         rmarkdown_1.8   
[13] tools_3.4.2      stringr_1.2.0    yaml_2.1.15      compiler_3.4.2  
[17] htmltools_0.3.6  openssl_0.9.9    knitr_1.17      


Background

The purpose of this section is to collate the supplied NOW corpus files into a SQL database. The supplied data comes in three table formats, as described in this documentation:

  1. a corpus table: containing words extracted from the online articles, one row for each occurrence (files in dir /db)
  2. a lexicon table: containing further details on the total set of words in the corpus, one row for each unique word (dict file in dir /shared)
  3. a sources table: containing the metadata on the articles from which words were extracted, one row for each unique article (sources files in dir /shared)

N.B. original article data held in dirs wlp/ and text/ are currently not being incorporated into the database.
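
To illustrate how the three tables fit together (corpus.wordID links to lexicon.wordID and corpus.textID links to sources.textID), here is a minimal sketch, assuming the schema created later in this workflow and an open DBI connection db; it is not part of the workflow scripts:

# Illustrative sketch only: join a few corpus occurrences back to their word
# forms and source metadata. Assumes the tables created by m02_make-db.sh and
# an open DBI connection `db` (as set up in m03_pop-db.R below).
dbGetQuery(db, "
  SELECT c.ID, l.word, l.lemma, l.PoS, s.date, s.country, s.url
  FROM corpus c
  JOIN lexicon l ON c.wordID = l.wordID
  JOIN sources s ON c.textID = s.textID
  LIMIT 5")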


Workflow

The workflow is designed to be run as batch scripts on iceberg/sharc due to the large size of the files and the eventual database. I’ll also be making a makefile to automate the whole process. The scripts are very specific to the task, with hardwired absolute paths and automatic user detection to ensure only authorised users can run them.

The workflow is currently split up into three scripts:

  • m01_proc-data.sh: copying and unzipping the data into an ooominds1 user directory (can only be run if the user has a personal directory in ooominds1/User)
  • m02_make-db.sh: creating the database in an ooominds1 user directory
  • m03_pop-db.R: populating the database

The following vignette walks a user through the code used to accomplish this.

N.B. you can mount the shared ooominds1 directory using the following shell command:

open 'smb://user:password@uosfstore.shef.ac.uk/shared/ooominds1'



m01_proc-data.sh: Copy and extract data using a bash script

Set user

The user is supplied automatically as an argument to the script on sharc/iceberg when run in batch mode (see the qsub script m01_proc-data.sge).

user=$1
echo $user

Copy

# copy a1517_now db & shared folders into user folder
cp -a /shared/ooominds1/Shared/corpus.byu.edu/a1517_now/db/ '/shared/ooominds1/User/'$user'/a1517_now/'
cp -a /shared/ooominds1/Shared/corpus.byu.edu/a1517_now/shared/ '/shared/ooominds1/User/'$user'/a1517_now/'

Unzip data

# unzip all .zip files into their corresponding folder.
find '/shared/ooominds1/User/'$user'/a1517_now/' -type f -iname "*.zip" | while read filename
  do unzip -o -d "`dirname "$filename"`" "$filename"
done



m02_make-db.sh: Create database

Set user and db_path

user=$1
echo $user
db_path='/shared/ooominds1/User/'$user'/now_db'

Delete any existing database at db_path

rm -f $db_path

Open connection to create db and create tables in sqlite3

The database is created by the call to sqlite3 at db_path. Within the sqlite3 session, create the three data tables plus del_lexicon, which will hold lexicon entries removed during cleaning:

sqlite3 $db_path <<EOF

CREATE TABLE lexicon (
wordID INTEGER PRIMARY KEY,
word TEXT,
lemma TEXT,
PoS TEXT
);

CREATE TABLE corpus (
textID INTEGER,
ID INTEGER PRIMARY KEY,
wordID INTEGER
);

CREATE TABLE sources (
textID INTEGER PRIMARY KEY,
ID INTEGER,
date TEXT,
country TEXT,
source TEXT,
url TEXT,
textTitle TEXT
);

CREATE TABLE del_lexicon (
wordID INTEGER PRIMARY KEY,
word TEXT,
lemma TEXT,
PoS TEXT,
remove_call TEXT
);

EOF


m03_pop-db.R: Populate db

Load dependencies

depend <- c("DBI", "RSQLite", "dplyr", "tm")
if (!require("pacman")) install.packages("pacman")
pacman::p_load(depend, character.only = T)

populate_nowdb - custom function to populate a db table

The following functions are sourced from R/pop-db_functions.R:

source(paste0("/shared/ooominds1/User/", user, "/scripts/R/pop-db_functions.R"))
populate_nowdb <- function(db, table, source_dir, pattern = ".txt",
                           index = NULL, nrows = -1){

  # internal function to ensure only raw data corresponding to rows with numeric
  # primary keys are included
  trim2pk <- function(db, tmp, table){
    tab_dtypes <- dbGetQuery(db, paste0("PRAGMA table_info(", table,")"))
    pk <- which(tab_dtypes$pk == 1)

    if(!is.numeric(tmp[,pk])){
      if(is.factor(tmp[,pk])){stop("primary key is factor instead of text. Likely read.delim setting error")}
      tmp <- tmp[suppressWarnings(!is.na(as.numeric(tmp[,pk]))),]
      tmp[,pk] <- as.numeric(tmp[,pk])
      row.names(tmp) <- NULL
    }
    tmp
  }

  # get table header
  table.names <- dbListFields(db, table)

  # if importing corpus files, extract del_lexicon to clean with
  if(table == "corpus"){
    remove_df <- dbReadTable(conn = db, name = "del_lexicon")
    if(nrow(remove_df) == 0){stop("'del_lexicon' table is empty. Ensure 'lexicon' is populated
                                  before 'corpus'")}
    }

  # get vector of pattern matched files to import
  filenames <- list.files(source_dir, pattern = pattern, full.names = T)

  for(filename in filenames) {
    cat("importing '", filename, "(", which(filenames == filename), "of",
        length(filenames), ")' \n")

    tmp <- read.delim(filename, header = F, quote=NULL, nrows = nrows,
                      comment='', skipNul = TRUE, stringsAsFactors = F,
                      strip.white = T) %>% setNames(table.names)

    # convert date format to ISO
    if(table == "sources"){
      tmp$date <- paste0("20", tmp$date)
    }

    # Ensure only raw data is included - ie weird headers are removed
    tmp <- trim2pk(db, tmp, table)

    tmp <- switch(table,
                  "lexicon" = clean_lexicon(db, tmp),
                  "corpus" = trim_table(tmp, remove_df),
                  "sources" = tmp)

    # write file to table
    dbWriteTable(db, name = table, value = tmp, append = T,
                 header = F)
  }

  if(!is.null(index)){
    for(i in index){
      dbExecute(db, paste0("CREATE INDEX ",i,"_index ON ",table," (",i,")"))
      cat("index --", paste0(i,"_index"), " -- created \n")
    }
  }

  cat("### ---- import of", length(filenames), "files into table **", table,
      "** complete ---- ###", "\n", "\n")
}

During development the read.delim call was producing two warnings:

1: In scan(...) : EOF within quoted string, handled by disabling quoting (quote = NULL in the call above).

2: In scan(...) : embedded nul(s) found in input, handled by skipping embedded nuls (skipNul = TRUE).

#' split_lexicon helper function:
#'
#' splits a lexicon table into clean words and words to be removed
#' @details the function currently removes NAs, punctuation marks and paragraph
#' indices and returns a list of length 2 containing:
#'  - a df containing the clean lexicon (tmp)
#'  - a df of removed words (remove_df)
split_lexicon <- function(tmp){
  # remove NAs
  remove_df <- tmp[is.na(tmp$word),] %>%
    mutate(remove_call = "na")
  tmp <- trim_table(tmp, remove_df)

  # remove paragraph ids
  remove_df <- tmp[grep("^@{2}", tmp$word),] %>% mutate(remove_call = "paragraph_id") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)

  #remove html tags
  #tmp <- tmp  %>% mutate(word = gsub("&amp;*", "---%%%---", word))
  #while(length(grep("---%%%---amp", tmp$word)) > 0){
    #tmp <- tmp  %>% mutate(word = gsub("---%%%---amp", "---%%%---",  word))
  #}
  #tags <- tmp[grep(".*&.*", tmp$word),"word"] %>% gsub("^.*&", "", .) %>%
    #gsub(";.*$", "", .) %>% unique()
  remove_df <- tmp[grep("&", tmp$word),] %>% mutate(remove_call = "html_tags") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  remove_df <- tmp[grep("<.*>", tmp$word),] %>% mutate(remove_call = "html_tags") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  remove_df <- tmp[grep(".*;TOOLONG", tmp$word),] %>% mutate(remove_call = "html_tags") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  remove_df <- tmp[grep("^/", tmp$word),] %>% mutate(remove_call = "slash_start") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)
  #tmp <- tmp  %>% mutate(word = gsub("---%%%---", "&", word))
  #
  # remove punctuation
  remove_df <- tmp[removePunctuation(tmp$word, preserve_intra_word_dashes = T) %>% stripWhitespace() == "",] %>%
    mutate(remove_call = "punctuations") %>% bind_rows(remove_df)
  tmp <- trim_table(tmp, remove_df)


  list(tmp = tmp, remove_df = remove_df)
}

#' trim helper function
#'
#' trim table of words marked to be removed in remove_df
trim_table <- function(tmp, remove_df){
  anti_join(tmp, remove_df, by = "wordID")
}
#' Clean lexicon
#'
#' splits the lexicon, writes the removed words to the del_lexicon table and
#' returns the clean lexicon
#' @param db database connection
#' @param tmp table to clean
#' @details the function currently removes NAs, punctuation marks, paragraph
#' indices and html tags
#' @export
clean_lexicon <- function(db, tmp){

  out <- split_lexicon(tmp)

  dbWriteTable(db, name = "del_lexicon", value = out$remove_df, append = T,
               header = F)
  out$tmp
}
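
To make the behaviour of these helpers concrete, here is a minimal self-contained sketch (hypothetical words and an in-memory SQLite database; not part of the workflow) showing clean_lexicon splitting a toy lexicon and logging the removed rows to del_lexicon:

# Toy demonstration only: hypothetical lexicon rows, in-memory database
library(DBI); library(RSQLite); library(dplyr); library(tm)

toy_db <- dbConnect(RSQLite::SQLite(), ":memory:")
dbExecute(toy_db, "CREATE TABLE del_lexicon (
  wordID INTEGER PRIMARY KEY, word TEXT, lemma TEXT, PoS TEXT, remove_call TEXT)")

toy_lexicon <- data.frame(
  wordID = 1:6,
  word   = c("the", NA, "@@123456", "&amp;", "!!", "well-known"),
  lemma  = c("the", NA, "@@123456", "&amp;", "!!", "well-known"),
  PoS    = c("at", "x", "x", "x", "y", "jj"),
  stringsAsFactors = FALSE
)

clean_lexicon(toy_db, toy_lexicon)  # returns the clean rows: "the", "well-known"
dbReadTable(toy_db, "del_lexicon")  # removed rows, tagged by remove_call
dbDisconnect(toy_db)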

parameters

Assign user, source_dir, db_path and shared_db_path from the supplied positional arguments, falling back to defaults. Echo the settings to the log.

args <- commandArgs(trailingOnly = TRUE)

user           <- if (length(args) >= 1) args[1] else "ac1adk"
source_dir     <- if (length(args) >= 2) args[2] else paste0("/shared/ooominds1/User/", user, "/a1517_now/")
db_path        <- if (length(args) >= 3) args[3] else paste0("/shared/ooominds1/User/", user, "/now_db")
shared_db_path <- if (length(args) >= 4) args[4] else "/shared/ooominds1/Shared/corpus.byu.edu/a1517_now/"

cat("user:", user, "\n",
    "source_dir:", source_dir, "\n",
    "db_path:", db_path, "\n")

connect to db

db <- dbConnect(RSQLite::SQLite(), dbname = db_path)

sanity checks

mount_ooominds1_volume()
dbListFields(db, "corpus")
[1] "textID" "ID"     "wordID"
dbListFields(db, "lexicon")
[1] "wordID" "word"   "lemma"  "PoS"   
dbListFields(db, "sources")
[1] "textID"    "ID"        "date"      "country"   "source"    "url"      
[7] "textTitle"
dbListTables(db)
[1] "corpus"      "del_lexicon" "lexicon"     "sources"    

Populate db tables

lexicon

populate_nowdb(db, table = "lexicon", source_dir = paste0(source_dir, "shared"),
               pattern =  "dic.txt", index = "PoS")

sources

populate_nowdb(db, table = "sources", source_dir = paste0(source_dir, "shared"),
               pattern = "sources_pt.\\.txt", index = c("source", "date"))

corpus

populate_nowdb(db, table = "corpus", source_dir = paste0(source_dir, "db"),
               index = c("textID", "wordID"))
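
An optional sanity check (not part of the batch script) is to look at the row counts of the populated tables before disconnecting:

# Optional check: row counts per table after population
for (tab in c("lexicon", "sources", "corpus", "del_lexicon")) {
  n <- dbGetQuery(db, paste0("SELECT COUNT(*) AS n FROM ", tab))$n
  cat(tab, ":", n, "rows", "\n")
}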

disconnect database

dbDisconnect(db)

copy to shared directory

file.copy(from = db_path, to = shared_db_path, overwrite = T)
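
An optional extra check (not in the original script) is to confirm the copied file matches the local one by size:

# Optional: confirm the copied database in the shared directory has the same
# size as the local file (file.copy keeps the basename of db_path)
stopifnot(file.size(file.path(shared_db_path, basename(db_path))) == file.size(db_path))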

This R Markdown site was created with workflowr