A set of translations of the Universal Declaration of Human Rights, along with a few other miscellaneous texts, used for testing the text input functions that must convert documents from different input encodings.

Source

The Universal Declaration of Human Rights resources, http://www.ohchr.org/EN/UDHR/Pages/SearchByLang.aspx

Examples

# NOT RUN {
# Unzip the bundled sample files into a temporary directory.
FILEDIR <- tempdir()
unzip(system.file("extdata", "data_files_encodedtexts.zip", package = "readtext"),
      exdir = FILEDIR)

# Each filename encodes its metadata as document_language_encoding.txt,
# so the declared encoding can be recovered from the name itself.
filenames <- list.files(FILEDIR, "\\.txt$")
# Strip the extension (dot must be escaped, otherwise "." matches any char).
filenames <- gsub("\\.txt$", "", filenames)
parts <- strsplit(filenames, "_")
# Third underscore-delimited field is the encoding name; vapply() guarantees
# a character result even if the file list is empty.
fileencodings <- vapply(parts, `[`, character(1), 3)
fileencodings

# Report which of the declared encodings iconv() cannot convert on
# this platform; those files would be skipped or garbled.
cat("Encoding conversions not available for this platform:\n")
not_available <- which(!(fileencodings %in% iconvlist()))
fileencodings[not_available]

# Read the files WITHOUT declaring encodings: non-ASCII texts come out
# mangled because everything is assumed to be in the native encoding.
library(readtext)
library(quanteda)
txts <- readtext(file.path(FILEDIR, "*.txt"))
substring(texts(txts)[1], 1, 80)  # gibberish
substring(texts(txts)[4], 1, 80)  # hex
substring(texts(txts)[40], 1, 80) # hex

# Read them again, this time declaring each file's encoding so that
# readtext converts everything to UTF-8 on input.
txts <- readtext(file.path(FILEDIR, "*.txt"), encoding = fileencodings)
substring(texts(txts)[1], 1, 80)  # English
substring(texts(txts)[4], 1, 80)  # Arabic, looking good
substring(texts(txts)[40], 1, 80) # Cyrillic, looking good
substring(texts(txts)[7], 1, 80)  # Chinese, looking good
substring(texts(txts)[26], 1, 80) # Hindi, looking good

# Finally, recover the per-document metadata from the filenames and
# build a quanteda corpus from the correctly decoded texts.
txts <- readtext(file.path(FILEDIR, "*.txt"), encoding = fileencodings,
                 docvarsfrom = "filenames",
                 docvarnames = c("document", "language", "inputEncoding"))
encodingCorpus <- corpus(txts, source = "Created by encoding-tests.R")
summary(encodingCorpus)
# }