R/data.R
data_files_encodedtexts.Rd
A set of translations of the Universal Declaration of Human Rights, plus one or two other miscellaneous texts, for testing the text input functions that must convert text from a variety of input encodings.
Source: the Universal Declaration of Human Rights resources, https://www.un.org/en/about-us/universal-declaration-of-human-rights
# Example: reading a set of text files that are stored in many different
# encodings, first without and then with the correct encoding supplied.
#
# The whole example sits inside ONE braced `if (FALSE)` block so that it is
# never run automatically (it unzips files and needs the quanteda package).
# Without the braces the guard applies only to the `FILEDIR <- tempdir()`
# assignment, and every later statement fails with
# "object 'FILEDIR' not found". To try the example, run the statements
# inside the braces interactively with readtext attached.
if (FALSE) {
  # unzip the files to a temporary directory
  FILEDIR <- tempdir()
  zip_path <- system.file("extdata", "data_files_encodedtexts.zip",
                          package = "readtext")
  unzip(zip_path, exdir = FILEDIR)

  # get encoding from filename
  filenames <- list.files(FILEDIR, "\\.txt$")
  # strip the extension (dot escaped so only a literal ".txt" suffix matches)
  filenames <- sub("\\.txt$", "", filenames)
  parts <- strsplit(filenames, "_")
  # the third "_"-separated field of each file name is its encoding;
  # vapply() guarantees a character vector even when no files are found
  fileencodings <- vapply(parts, "[", character(1), 3)
  fileencodings

  # find out which conversions are unavailable (through iconv())
  cat("Encoding conversions not available for this platform:")
  notAvailableIndex <- which(!(fileencodings %in% iconvlist()))
  fileencodings[notAvailableIndex]

  # try readtext
  # library() stops with an error if quanteda is missing, whereas require()
  # would only return FALSE and let the corpus() call below fail obscurely
  library(quanteda)
  txt_glob <- file.path(FILEDIR, "*.txt")

  # read without specifying encodings: non-ASCII texts come out wrong.
  # The texts are taken from the `text` column of the readtext data.frame
  # rather than via texts(), which is masked between readtext and quanteda.
  txts <- readtext(txt_glob)
  substring(txts$text[1], 1, 80)   # gibberish
  substring(txts$text[4], 1, 80)   # hex
  substring(txts$text[40], 1, 80)  # hex

  # read them in again, this time supplying one encoding per file
  # NOTE(review): assumes readtext() returns the files in the same order as
  # list.files() above, so encodings line up with files -- confirm
  txts <- readtext(txt_glob, encoding = fileencodings)
  substring(txts$text[1], 1, 80)   # English
  substring(txts$text[4], 1, 80)   # Arabic, looking good
  substring(txts$text[40], 1, 80)  # Cyrillic, looking good
  substring(txts$text[7], 1, 80)   # Chinese, looking good
  substring(txts$text[26], 1, 80)  # Hindi, looking good

  # read once more, also turning the file-name fields into document variables
  txts <- readtext(txt_glob, encoding = fileencodings,
                   docvarsfrom = "filenames",
                   docvarnames = c("document", "language", "inputEncoding"))
  encodingCorpus <- corpus(txts, source = "Created by encoding-tests.R")
  summary(encodingCorpus)
}