R/data.R
data_files_encodedtexts.Rd
A set of translations of the Universal Declaration of Human Rights, plus one or two other miscellaneous texts, for testing the text input functions that need to translate different input encodings.
The Universal Declaration of Human Rights resources, https://www.un.org/en/about-us/universal-declaration-of-human-rights
# Example: read the bundled encoded texts, first without and then with their
# declared input encodings.
#
# The whole example sits inside ONE braced `if (FALSE) { ... }` so that it is
# skipped as a unit. An unbraced `if (FALSE)` guards only the single expression
# that follows it, which would skip the `FILEDIR` assignment alone and leave
# every later statement failing with "object 'FILEDIR' not found".
#
# To try the example, run the statements inside the braces interactively;
# values of bare expressions (e.g. `fileencodings`) are only auto-printed when
# evaluated at top level.
if (FALSE) {
  # unzip the files to a temporary directory
  FILEDIR <- tempdir()
  unzip(
    system.file("extdata", "data_files_encodedtexts.zip", package = "readtext"),
    exdir = FILEDIR
  )

  # glob pattern matching every extracted text file (built once, reused below)
  txt_glob <- file.path(FILEDIR, "*.txt")

  # get encoding from filename
  filenames <- list.files(FILEDIR, "\\.txt$")
  # strip the extension (dot escaped so only a literal ".txt" suffix matches)
  filenames <- sub("\\.txt$", "", filenames)
  parts <- strsplit(filenames, "_")
  # the encoding is the third underscore-separated field of each file name;
  # vapply() guarantees a character vector even when no files are found
  fileencodings <- vapply(parts, function(p) p[3], character(1))
  fileencodings

  # find out which conversions are unavailable (through iconv())
  cat("Encoding conversions not available for this platform:")
  notAvailableIndex <- which(!(fileencodings %in% iconvlist()))
  fileencodings[notAvailableIndex]

  # try readtext
  # library() stops with an error if quanteda is missing, instead of returning
  # FALSE and letting later calls fail
  library(quanteda)

  # first 80 characters of the i-th document; reads the `text` column of the
  # readtext data.frame directly rather than going through texts()
  preview <- function(x, i) substring(x[["text"]][i], 1, 80)

  txts <- readtext(txt_glob)
  preview(txts, 1)  # gibberish
  preview(txts, 4)  # hex
  preview(txts, 40) # hex

  # read them in again
  txts <- readtext(txt_glob, encoding = fileencodings)
  preview(txts, 1)  # English
  preview(txts, 4)  # Arabic, looking good
  preview(txts, 40) # Cyrillic, looking good
  preview(txts, 7)  # Chinese, looking good
  preview(txts, 26) # Hindi, looking good

  # read once more, this time also turning the file-name fields into docvars
  txts <- readtext(
    txt_glob,
    encoding = fileencodings,
    docvarsfrom = "filenames",
    docvarnames = c("document", "language", "inputEncoding")
  )
  # corpus-level metadata is supplied through `meta`
  encodingCorpus <- corpus(
    txts,
    meta = list(source = "Created by encoding-tests.R")
  )
  summary(encodingCorpus)
}