A set of translations of the Universal Declaration of Human Rights, plus one or two other miscellaneous texts, for testing the text input functions that need to translate different input encodings.

Source

The Universal Declaration of Human Rights resources, http://www.ohchr.org/EN/UDHR/Pages/SearchByLang.aspx

Examples

# Example: read the bundled encoded texts, first without and then with the
# encoding taken from each file name, and build a corpus from the result.
#
# The whole example is wrapped in `if (FALSE) { ... }` so that none of it is
# executed by default; it needs the readtext and quanteda packages and writes
# files to the session's temporary directory.
if (FALSE) {
  library(readtext)
  library(quanteda)

  # unzip the files to a temporary directory
  FILEDIR <- tempdir()
  unzip(
    system.file(
      "extdata", "data_files_encodedtexts.zip",
      package = "readtext"
    ),
    exdir = FILEDIR
  )

  # get encoding from filename
  filenames <- list.files(FILEDIR, "\\.txt$")
  # strip the extension (dot escaped so only a literal ".txt" suffix matches)
  filenames <- sub("\\.txt$", "", filenames)
  parts <- strsplit(filenames, "_")
  # the third "_"-separated field of each file name is used as its encoding;
  # vapply() guarantees a character vector even when no files are found
  fileencodings <- vapply(parts, "[", character(1), 3)
  fileencodings

  # find out which conversions are unavailable (through iconv())
  cat("Encoding conversions not available for this platform:")
  notAvailableIndex <- which(!(fileencodings %in% iconvlist()))
  fileencodings[notAvailableIndex]

  # try readtext without specifying the encodings
  txts <- readtext(file.path(FILEDIR, "*.txt"))
  substring(texts(txts)[1], 1, 80)  # gibberish
  substring(texts(txts)[4], 1, 80)  # hex
  substring(texts(txts)[40], 1, 80) # hex

  # read them in again, this time passing the encoding of each file
  txts <- readtext(file.path(FILEDIR, "*.txt"), encoding = fileencodings)
  substring(texts(txts)[1], 1, 80)  # English
  substring(texts(txts)[4], 1, 80)  # Arabic, looking good
  substring(texts(txts)[40], 1, 80) # Cyrillic, looking good
  substring(texts(txts)[7], 1, 80)  # Chinese, looking good
  substring(texts(txts)[26], 1, 80) # Hindi, looking good

  # read once more, also turning the file-name fields into document variables
  txts <- readtext(
    file.path(FILEDIR, "*.txt"),
    encoding = fileencodings,
    docvarsfrom = "filenames",
    docvarnames = c("document", "language", "inputEncoding")
  )
  encodingCorpus <- corpus(txts, source = "Created by encoding-tests.R")
  summary(encodingCorpus)
}