Functions in this Package

This vignette describes the functions that have not yet been described in other vignettes.

Provide Random Example Paths

set.seed(53653815)

(paths <- kwb.pathdict::random_paths(4, min_chars = 5, max_elements = 3))
#>  [1] "liquid/oxygen/sharp/experiment/continue.xls"
#>  [2] "liquid/oxygen/sharp/written.doc"            
#>  [3] "liquid/oxygen/sharp/phrase/discuss.R"       
#>  [4] "liquid/oxygen/sharp/phrase/occur.jpg"       
#>  [5] "liquid/oxygen/stand.doc"                    
#>  [6] "liquid/oxygen/train.png"                    
#>  [7] "liquid/electric.png"                        
#>  [8] "liquid/consider/industry/power.pdf"         
#>  [9] "liquid/consider/industry/quite/wheel.pdf"   
#> [10] "liquid/consider/industry/quite/drink.png"   
#> [11] "liquid/consider/child.png"

data_frame_to_paths()

The function data_frame_to_paths() takes a subdirectory matrix that is stored as a data frame and converts it back to a vector of paths by pasting the non-empty columns of each row together, separated by slashes. Paths that are shallower than the maximum depth must contain empty strings "" (and not NA) in the trailing columns; otherwise the string "NA" will appear in the resulting paths.

# Create subdirectory matrix as data frame
(subdir_data <- kwb.file::to_subdir_matrix(paths, result_type = "data.frame"))
#>        V1           V2        V3          V4           V5
#> 1  liquid       oxygen     sharp  experiment continue.xls
#> 2  liquid       oxygen     sharp written.doc             
#> 3  liquid       oxygen     sharp      phrase    discuss.R
#> 4  liquid       oxygen     sharp      phrase    occur.jpg
#> 5  liquid       oxygen stand.doc                         
#> 6  liquid       oxygen train.png                         
#> 7  liquid electric.png                                   
#> 8  liquid     consider  industry   power.pdf             
#> 9  liquid     consider  industry       quite    wheel.pdf
#> 10 liquid     consider  industry       quite    drink.png
#> 11 liquid     consider child.png

# Reconstruct the paths from this data frame
paths_reconstructed <- kwb.pathdict:::data_frame_to_paths(subdir_data)

# Check that the paths have not changed
identical(paths_reconstructed, paths)
#> [1] TRUE

to_placeholder(), is_placeholder(), to_dictionary_key()

The function to_placeholder() converts a string to a placeholder <string>.

kwb.pathdict:::to_placeholder("abc")
#> [1] "<abc>"

(placeholders <- kwb.pathdict:::to_placeholder(c("abc", 123)))
#> [1] "<abc>" "<123>"

The function is_placeholder() checks whether a string looks like a <placeholder>:

kwb.pathdict:::is_placeholder(placeholders)
#> [1] TRUE TRUE
kwb.pathdict:::is_placeholder("abc")
#> [1] FALSE
kwb.pathdict:::is_placeholder("[abc]")
#> [1] FALSE
kwb.pathdict:::is_placeholder(c("<abc>", "<def>", "ghi", "<jkl>"))
#> [1]  TRUE  TRUE FALSE  TRUE
kwb.pathdict:::is_placeholder("<abc>def")
#> [1] FALSE
kwb.pathdict:::is_placeholder("<abc><def>")
#> [1] FALSE
kwb.pathdict:::is_placeholder("<>")
#> [1] FALSE

The function to_dictionary_key() creates the name for a key in a dictionary. The name consists of a prefix (“p” by default) followed by a number in hexadecimal notation:

kwb.pathdict:::to_dictionary_key(1:17)
#>  [1] "p1"  "p2"  "p3"  "p4"  "p5"  "p6"  "p7"  "p8"  "p9"  "pA"  "pB"  "pC" 
#> [13] "pD"  "pE"  "pF"  "p10" "p11"
kwb.pathdict:::to_dictionary_key(1:17, leading_zeros = TRUE)
#>  [1] "p01" "p02" "p03" "p04" "p05" "p06" "p07" "p08" "p09" "p0A" "p0B" "p0C"
#> [13] "p0D" "p0E" "p0F" "p10" "p11"

to_dictionary(), use_dictionary()

The function to_dictionary() creates a “dictionary” for a given vector of input strings. Each unique input string is given a short name by which it can be looked up in the returned dictionary. The dictionary is a list with the unique input strings as values and the assigned short names as keys. The entries in the dictionary are sorted in decreasing order of the “importance” of the corresponding input string, i.e. the product of its frequency and its string length.

dict <- kwb.pathdict:::to_dictionary(dirname(paths))

# Show the dictionary in the form of a data frame
kwb.pathdict:::named_vector_to_data_frame(unlist(dict))
#>   name                          value
#> 1   a1 liquid/consider/industry/quite
#> 2   a2     liquid/oxygen/sharp/phrase
#> 3   a3 liquid/oxygen/sharp/experiment
#> 4   a4                  liquid/oxygen
#> 5   a5       liquid/consider/industry
#> 6   a6            liquid/oxygen/sharp
#> 7   a7                liquid/consider
#> 8   a8                         liquid

Once you have defined a dictionary you can replace the original strings with placeholders that correspond to the short names in the dictionary:

(short_paths <- kwb.pathdict:::use_dictionary(dirname(paths), dict))
#>  [1] "<a3>" "<a6>" "<a2>" "<a2>" "<a4>" "<a4>" "<a8>" "<a5>" "<a1>" "<a1>"
#> [11] "<a7>"

The short paths can be resolved back to the original paths using the resolve() function from the kwb.utils package:

# Resolve the placeholders in the short paths back to the original paths
long_paths <- kwb.utils::resolve(short_paths, dict)

# Check that the original paths were reproduced
identical(long_paths, dirname(paths))
#> [1] TRUE

get_dictionary_one_by_one()

result <- kwb.pathdict:::get_dictionary_one_by_one(paths, n = 5)
#> Splitting paths ... ok. (0.00s)

#> Distribution of path depths:
#> n_levels
#> 2 3 4 5 
#> 1 3 2 5 
#> i = 1, n = 11...
#> utils::head(y):
#> liquid 
#>     66 
#> i = 2, n = 11...
#> utils::head(y):
#> x
#>       liquid/oxygen     liquid/consider liquid/electric.png 
#>                  78                  60                  19 
#> i = 3, n = 10...
#> utils::head(y):
#> x
#>       liquid/oxygen/sharp  liquid/consider/industry liquid/consider/child.png 
#>                        76                        72                        25 
#>   liquid/oxygen/stand.doc   liquid/oxygen/train.png 
#>                        23                        23 
#> i = 4, n = 7...
#> utils::head(y):
#> x
#>     liquid/consider/industry/quite         liquid/oxygen/sharp/phrase 
#>                                 60                                 52 
#> liquid/consider/industry/power.pdf    liquid/oxygen/sharp/written.doc 
#>                                 34                                 31 
#>     liquid/oxygen/sharp/experiment 
#>                                 30 
#> i = 5, n = 5...
#> utils::head(y):
#> x
#> liquid/oxygen/sharp/experiment/continue.xls 
#>                                          43 
#>    liquid/consider/industry/quite/drink.png 
#>                                          40 
#>    liquid/consider/industry/quite/wheel.pdf 
#>                                          40 
#>        liquid/oxygen/sharp/phrase/discuss.R 
#>                                          36 
#>        liquid/oxygen/sharp/phrase/occur.jpg 
#>                                          36 
#>                                          path score length count score2
#> 1                         liquid/oxygen/sharp    76     19     4     60
#> 2                               liquid/oxygen    78     13     6     54
#> 3              liquid/consider/industry/quite    60     30     2     52
#> 4 liquid/oxygen/sharp/experiment/continue.xls    43     43     1     39
#> 5                                      liquid    66      6    11     22
#>  i key score count length                path score2
#>  1  p1    76     4     19 liquid/oxygen/sharp     60
#>                                          path score length count score2
#> 1                               liquid/oxygen    78     13     6     54
#> 2              liquid/consider/industry/quite    60     30     2     52
#> 3 liquid/oxygen/sharp/experiment/continue.xls    43     28     1     24
#> 4                                      liquid    66      6    11     22
#>  i key score count length          path score2
#>  2  p2    78     6     13 liquid/oxygen     54
#>                                          path score length count score2
#> 1              liquid/consider/industry/quite    60     30     2     52
#> 2                                      liquid    66      6    11     22
#> 3 liquid/oxygen/sharp/experiment/continue.xls    43     19     1     15
#>  i key score count length                           path score2
#>  3  p3    60     2     30 liquid/consider/industry/quite     52
#>                                          path score length count score2
#> 1                                      liquid    66      6    11     22
#> 2 liquid/oxygen/sharp/experiment/continue.xls    43     19     1     15
#>  i key score count length   path score2
#>  4  p4    66    11      6 liquid     22
#>                                          path score length count score2
#> 1 liquid/oxygen/sharp/experiment/continue.xls    43     17     1     13
#>  i key score count length                                        path score2
#>  5  p5    43     1     17 liquid/oxygen/sharp/experiment/continue.xls     13
result
#> $p1
#> [1] "liquid/oxygen/sharp"
#> 
#> $p2
#> [1] "liquid/oxygen"
#> 
#> $p3
#> [1] "liquid/consider/industry/quite"
#> 
#> $p4
#> [1] "liquid"
#> 
#> $p5
#> [1] "liquid/oxygen/sharp/experiment/continue.xls"

Subfolder Frequency Functions

At the start of get_dictionary_one_by_one() the function get_subdir_frequencies() is called. This function can be given a vector of path strings as input. For each possible depth in the path tree all different sub-paths are determined and ordered by their “importance”, i.e. by the product of frequency and path length in number of characters. By default, only the most important path per depth is returned:

frequencies <- kwb.pathdict:::get_subdir_frequencies(paths = paths, dbg = FALSE)
#> Splitting paths ... ok. (0.00s)

# Helper for output: print every element of a frequency list as a data frame,
# each one preceded by a header line that states its position (depth) in the
# list.
print_frequencies <- function(f) {
  for (depth in seq_along(f)) {
    # Header goes through message(), i.e. it is not part of the printed output
    message("=== depth: ", depth)
    level_data <- kwb.pathdict:::named_vector_to_data_frame(f[[depth]])
    print(level_data)
  }
}

print_frequencies(frequencies)
#> === depth: 1
#>     name value
#> 1 liquid    66
#> === depth: 2
#>            name value
#> 1 liquid/oxygen    78
#> === depth: 3
#>                  name value
#> 1 liquid/oxygen/sharp    76
#> === depth: 4
#>                             name value
#> 1 liquid/consider/industry/quite    60
#> === depth: 5
#>                                          name value
#> 1 liquid/oxygen/sharp/experiment/continue.xls    43

To return all different paths, set first.only = FALSE:

all_frequencies <- kwb.pathdict:::get_subdir_frequencies(
  paths = paths, first.only = FALSE, dbg = FALSE
)
#> Splitting paths ... ok. (0.00s)

print_frequencies(all_frequencies)
#> === depth: 1
#>     name value
#> 1 liquid    66
#> === depth: 2
#>                  name value
#> 1       liquid/oxygen    78
#> 2     liquid/consider    60
#> 3 liquid/electric.png    19
#> === depth: 3
#>                        name value
#> 1       liquid/oxygen/sharp    76
#> 2  liquid/consider/industry    72
#> 3 liquid/consider/child.png    25
#> 4   liquid/oxygen/stand.doc    23
#> 5   liquid/oxygen/train.png    23
#> === depth: 4
#>                                 name value
#> 1     liquid/consider/industry/quite    60
#> 2         liquid/oxygen/sharp/phrase    52
#> 3 liquid/consider/industry/power.pdf    34
#> 4    liquid/oxygen/sharp/written.doc    31
#> 5     liquid/oxygen/sharp/experiment    30
#> === depth: 5
#>                                          name value
#> 1 liquid/oxygen/sharp/experiment/continue.xls    43
#> 2    liquid/consider/industry/quite/drink.png    40
#> 3    liquid/consider/industry/quite/wheel.pdf    40
#> 4        liquid/oxygen/sharp/phrase/discuss.R    36
#> 5        liquid/oxygen/sharp/phrase/occur.jpg    36

The function to_frequency_data() is called next within get_dictionary_one_by_one(). It converts the list returned by get_subdir_frequencies() to a data frame.

# Convert frequency list to data frame
(frequency_data <- kwb.pathdict:::to_frequency_data(frequencies))
#>                                          path score length count
#> 1                               liquid/oxygen    78     13     6
#> 2                         liquid/oxygen/sharp    76     19     4
#> 3                                      liquid    66      6    11
#> 4              liquid/consider/industry/quite    60     30     2
#> 5 liquid/oxygen/sharp/experiment/continue.xls    43     43     1
(all_frequency_data <- kwb.pathdict:::to_frequency_data(all_frequencies))
#>                                           path score length count
#> 1                                liquid/oxygen    78     13     6
#> 2                          liquid/oxygen/sharp    76     19     4
#> 3                     liquid/consider/industry    72     24     3
#> 4                                       liquid    66      6    11
#> 5                              liquid/consider    60     15     4
#> 6               liquid/consider/industry/quite    60     30     2
#> 7                   liquid/oxygen/sharp/phrase    52     26     2
#> 8  liquid/oxygen/sharp/experiment/continue.xls    43     43     1
#> 9     liquid/consider/industry/quite/drink.png    40     40     1
#> 10    liquid/consider/industry/quite/wheel.pdf    40     40     1
#> 11        liquid/oxygen/sharp/phrase/discuss.R    36     36     1
#> 12        liquid/oxygen/sharp/phrase/occur.jpg    36     36     1
#> 13          liquid/consider/industry/power.pdf    34     34     1
#> 14             liquid/oxygen/sharp/written.doc    31     31     1
#> 15              liquid/oxygen/sharp/experiment    30     30     1
#> 16                   liquid/consider/child.png    25     25     1
#> 17                     liquid/oxygen/stand.doc    23     23     1
#> 18                     liquid/oxygen/train.png    23     23     1
#> 19                         liquid/electric.png    19     19     1

The function rescore_and_reorder_frequency_data() takes a data frame with columns length and count as input. It calculates score2 = (length - placeholder_size) * count and orders the data frame decreasingly by this score.

kwb.pathdict:::rescore_and_reorder_frequency_data(
  frequency_data, 
  # Length of placeholder string to be used instead of path
  placeholder_size = nchar("<p1>")
)
#>                                          path score length count score2
#> 1                         liquid/oxygen/sharp    76     19     4     60
#> 2                               liquid/oxygen    78     13     6     54
#> 3              liquid/consider/industry/quite    60     30     2     52
#> 4 liquid/oxygen/sharp/experiment/continue.xls    43     43     1     39
#> 5                                      liquid    66      6    11     22

get_next_level()

The function get_next_level() is a helper function that is called by the function compress_one_by_one().

kwb.pathdict:::get_next_level
#> function (x, key, set.attributes = FALSE, dbg = FALSE) 
#> {
#>     freqs <- get_subdir_frequencies(paths = x, dbg = dbg)
#>     allfreqs <- sort(unlist(freqs), decreasing = TRUE)
#>     dict <- structure(list(names(allfreqs[1])), names = key)
#>     result <- use_dictionary(x, dict, method = "part")
#>     if (set.attributes) {
#>         attr(result, "freqs") <- freqs
#>         attr(result, "dict") <- dict
#>     }
#>     result
#> }
#> <bytecode: 0x36e0c90>
#> <environment: namespace:kwb.pathdict>

get_subdirs_by_frequency()

The function get_subdirs_by_frequency() is used by a FAKIN script that is not yet part of a package. It takes the following inputs:

subdirs: subdirectory matrix
cumid: cumulative identifier, created with kwb.pathdict:::to_cumulative_id()
freqinfo: one-row data frame with columns depth, n.x, n.Freq

subdirs <- kwb.file::to_subdir_matrix(paths)
cumid <- kwb.pathdict:::to_cumulative_id(subdirs)
#> depth: 00 1 2 3 4 5
freqinfo <- data.frame(depth = 2, n.x = 10, n.Freq = 10)

kwb.pathdict:::get_subdirs_by_frequency(subdirs, cumid, freqinfo)
#> freqinfo:
#>   depth n.x n.Freq
#> 1     2  10     10
#> [1] NA NA

Functions that still need to be described

kwb.pathdict:::log_result_if
#> function (dbg, x, y) 
#> {
#>     if (dbg) {
#>         kwb.utils::catLines(c("\n### x:", x))
#>         kwb.utils::catLines(c("\n### y:", y))
#>         cat("\n### str(dict):\n")
#>         utils::str(kwb.utils::getAttribute(y, "dict"))
#>     }
#> }
#> <bytecode: 0x1bb6368>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::lookup_in_dictionary
#> function (x, dict) 
#> {
#>     ready <- x %in% to_placeholder(names(dict))
#>     out <- x
#>     out[!ready] <- to_placeholder(names(dict[match(x[!ready], 
#>         dict)]))
#>     out
#> }
#> <bytecode: 0x1b18718>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::print_path_frequencies
#> function (x, maxchar = 80) 
#> {
#>     x$path <- substr(x$path, 1, maxchar)
#>     print(x)
#> }
#> <bytecode: 0x50a9a98>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::replace_subdirs
#> function (s, r, p) 
#> {
#>     selected <- starts_with_parts(s, r)
#>     cols <- seq(length(r) + 1, ncol(s))
#>     fillright <- matrix(nrow = sum(selected), ncol = length(r) - 
#>         1)
#>     s[selected, ] <- cbind(p, s[selected, cols, drop = FALSE], 
#>         fillright)
#>     maxcol <- max(which(apply(s, 2, function(x) sum(!is.na(x))) > 
#>         0))
#>     s[, seq_len(maxcol)]
#> }
#> <bytecode: 0x46018c0>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::starts_with_parts
#> function (parts, elements) 
#> {
#>     stopifnot(is.list(parts) || is.matrix(parts))
#>     stopifnot(all(!is.na(elements)))
#>     length_out <- if (is.list(parts)) 
#>         length(parts)
#>     else nrow(parts)
#>     selected_at_level <- lapply(seq_along(elements), function(i) {
#>         if (is.list(parts)) {
#>             sapply(parts, "[", i) == elements[i]
#>         }
#>         else {
#>             !is.na(parts[, i]) & (parts[, i] == elements[i])
#>         }
#>     })
#>     Reduce(`&`, selected_at_level, init = rep(TRUE, length_out))
#> }
#> <bytecode: 0x4394ba0>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::update_frequency_data_length
#> function (frequency_data, winner, key) 
#> {
#>     get_column <- kwb.utils::selectColumns
#>     winner_length <- get_column(winner, "length")
#>     winner_path <- get_column(winner, "path")
#>     data_length <- get_column(frequency_data, "length")
#>     data_path <- get_column(frequency_data, "path")
#>     shortage <- winner_length - nchar(to_placeholder(key))
#>     matching <- (substr(data_path, 1, winner_length) == winner_path)
#>     frequency_data$length[matching] <- data_length[matching] - 
#>         shortage
#>     frequency_data
#> }
#> <bytecode: 0x517e1c8>
#> <environment: namespace:kwb.pathdict>

Hauke Sonnenberg

2020-01-10