This vignette describes the functions that have not yet been described in other vignettes.
# Fix the random seed so that the example paths are reproducible
set.seed(53653815)
# Generate random example paths; the outer parentheses print the result
(paths <- kwb.pathdict::random_paths(4, min_chars = 5, max_elements = 3))
#> [1] "liquid/oxygen/sharp/experiment/continue.xls"
#> [2] "liquid/oxygen/sharp/written.doc"
#> [3] "liquid/oxygen/sharp/phrase/discuss.R"
#> [4] "liquid/oxygen/sharp/phrase/occur.jpg"
#> [5] "liquid/oxygen/stand.doc"
#> [6] "liquid/oxygen/train.png"
#> [7] "liquid/electric.png"
#> [8] "liquid/consider/industry/power.pdf"
#> [9] "liquid/consider/industry/quite/wheel.pdf"
#> [10] "liquid/consider/industry/quite/drink.png"
#> [11] "liquid/consider/child.png"
The function data_frame_to_paths()
converts a subdirectory matrix that has been converted to a data frame back to the paths by pasting all non-empty columns with slashes. Paths that do not have the maximum depth must have empty values ""
(and not NA
) in the last columns, otherwise the string NA
will appear in the paths.
# Create subdirectory matrix as data frame
(subdir_data <- kwb.file::to_subdir_matrix(paths, result_type = "data.frame"))
#> V1 V2 V3 V4 V5
#> 1 liquid oxygen sharp experiment continue.xls
#> 2 liquid oxygen sharp written.doc
#> 3 liquid oxygen sharp phrase discuss.R
#> 4 liquid oxygen sharp phrase occur.jpg
#> 5 liquid oxygen stand.doc
#> 6 liquid oxygen train.png
#> 7 liquid electric.png
#> 8 liquid consider industry power.pdf
#> 9 liquid consider industry quite wheel.pdf
#> 10 liquid consider industry quite drink.png
#> 11 liquid consider child.png
# Reconstruct the paths from this data frame
paths_reconstructed <- kwb.pathdict:::data_frame_to_paths(subdir_data)
# Check that the paths have not changed
identical(paths_reconstructed, paths)
#> [1] TRUE
The function to_placeholder()
converts a string to a placeholder <string>.
kwb.pathdict:::to_placeholder("abc")
#> [1] "<abc>"
(placeholders <- kwb.pathdict:::to_placeholder(c("abc", 123)))
#> [1] "<abc>" "<123>"
The function is_placeholder()
checks whether a string looks like a <placeholder>:
kwb.pathdict:::is_placeholder(placeholders)
#> [1] TRUE TRUE
kwb.pathdict:::is_placeholder("abc")
#> [1] FALSE
kwb.pathdict:::is_placeholder("[abc]")
#> [1] FALSE
kwb.pathdict:::is_placeholder(c("<abc>", "<def>", "ghi", "<jkl>"))
#> [1] TRUE TRUE FALSE TRUE
kwb.pathdict:::is_placeholder("<abc>def")
#> [1] FALSE
kwb.pathdict:::is_placeholder("<abc><def>")
#> [1] FALSE
kwb.pathdict:::is_placeholder("<>")
#> [1] FALSE
The function to_dictionary_key()
creates the name for a key in a dictionary, consisting of a prefix (“p” by default) and a number, given in the hexadecimal system:
kwb.pathdict:::to_dictionary_key(1:17)
#> [1] "p1" "p2" "p3" "p4" "p5" "p6" "p7" "p8" "p9" "pA" "pB" "pC"
#> [13] "pD" "pE" "pF" "p10" "p11"
kwb.pathdict:::to_dictionary_key(1:17, leading_zeros = TRUE)
#> [1] "p01" "p02" "p03" "p04" "p05" "p06" "p07" "p08" "p09" "p0A" "p0B" "p0C"
#> [13] "p0D" "p0E" "p0F" "p10" "p11"
The function to_dictionary()
creates a “dictionary” for a given vector of input strings. Each unique input string is given a short name by which it can be looked up in the returned dictionary. The dictionary is a list with the unique input strings as values and the assigned short names as keys. The entries in the dictionary are sorted in decreasing order of the “importance” of the corresponding input string, i.e. the product of frequency and string length.
dict <- kwb.pathdict:::to_dictionary(dirname(paths))
# Show the dictionary in the form of a data frame
kwb.pathdict:::named_vector_to_data_frame(unlist(dict))
#> name value
#> 1 a1 liquid/consider/industry/quite
#> 2 a2 liquid/oxygen/sharp/phrase
#> 3 a3 liquid/oxygen/sharp/experiment
#> 4 a4 liquid/oxygen
#> 5 a5 liquid/consider/industry
#> 6 a6 liquid/oxygen/sharp
#> 7 a7 liquid/consider
#> 8 a8 liquid
Once you have defined a dictionary you can replace the original strings with placeholders that correspond to the short names in the dictionary:
(short_paths <- kwb.pathdict:::use_dictionary(dirname(paths), dict))
#> [1] "<a3>" "<a6>" "<a2>" "<a2>" "<a4>" "<a4>" "<a8>" "<a5>" "<a1>" "<a1>"
#> [11] "<a7>"
The short paths can be resolved back to the original paths using the resolve()
function from the kwb.utils package:
#> Distribution of path depths:
#> n_levels
#> 2 3 4 5
#> 1 3 2 5
#> i = 1, n = 11...
#> utils::head(y):
#> liquid
#> 66
#> i = 2, n = 11...
#> utils::head(y):
#> x
#> liquid/oxygen liquid/consider liquid/electric.png
#> 78 60 19
#> i = 3, n = 10...
#> utils::head(y):
#> x
#> liquid/oxygen/sharp liquid/consider/industry liquid/consider/child.png
#> 76 72 25
#> liquid/oxygen/stand.doc liquid/oxygen/train.png
#> 23 23
#> i = 4, n = 7...
#> utils::head(y):
#> x
#> liquid/consider/industry/quite liquid/oxygen/sharp/phrase
#> 60 52
#> liquid/consider/industry/power.pdf liquid/oxygen/sharp/written.doc
#> 34 31
#> liquid/oxygen/sharp/experiment
#> 30
#> i = 5, n = 5...
#> utils::head(y):
#> x
#> liquid/oxygen/sharp/experiment/continue.xls
#> 43
#> liquid/consider/industry/quite/drink.png
#> 40
#> liquid/consider/industry/quite/wheel.pdf
#> 40
#> liquid/oxygen/sharp/phrase/discuss.R
#> 36
#> liquid/oxygen/sharp/phrase/occur.jpg
#> 36
#> path score length count score2
#> 1 liquid/oxygen/sharp 76 19 4 60
#> 2 liquid/oxygen 78 13 6 54
#> 3 liquid/consider/industry/quite 60 30 2 52
#> 4 liquid/oxygen/sharp/experiment/continue.xls 43 43 1 39
#> 5 liquid 66 6 11 22
#> i key score count length path score2
#> 1 p1 76 4 19 liquid/oxygen/sharp 60
#> path score length count score2
#> 1 liquid/oxygen 78 13 6 54
#> 2 liquid/consider/industry/quite 60 30 2 52
#> 3 liquid/oxygen/sharp/experiment/continue.xls 43 28 1 24
#> 4 liquid 66 6 11 22
#> i key score count length path score2
#> 2 p2 78 6 13 liquid/oxygen 54
#> path score length count score2
#> 1 liquid/consider/industry/quite 60 30 2 52
#> 2 liquid 66 6 11 22
#> 3 liquid/oxygen/sharp/experiment/continue.xls 43 19 1 15
#> i key score count length path score2
#> 3 p3 60 2 30 liquid/consider/industry/quite 52
#> path score length count score2
#> 1 liquid 66 6 11 22
#> 2 liquid/oxygen/sharp/experiment/continue.xls 43 19 1 15
#> i key score count length path score2
#> 4 p4 66 11 6 liquid 22
#> path score length count score2
#> 1 liquid/oxygen/sharp/experiment/continue.xls 43 17 1 13
#> i key score count length path score2
#> 5 p5 43 1 17 liquid/oxygen/sharp/experiment/continue.xls 13
result
#> $p1
#> [1] "liquid/oxygen/sharp"
#>
#> $p2
#> [1] "liquid/oxygen"
#>
#> $p3
#> [1] "liquid/consider/industry/quite"
#>
#> $p4
#> [1] "liquid"
#>
#> $p5
#> [1] "liquid/oxygen/sharp/experiment/continue.xls"
At the start of get_dictionary_one_by_one()
the function get_subdir_frequencies()
is called. This function can be given a vector of path strings as input. For each possible depth in the path tree all different sub-paths are determined and ordered by their “importance”, i.e. by the product of frequency and path length in number of characters. By default, only the most important path per depth is returned:
frequencies <- kwb.pathdict:::get_subdir_frequencies(paths = paths, dbg = FALSE)
#> Splitting paths ... ok. (0.00s)
# Helper for output: for each element of the list f, emit a header message
# with the element's position (the depth) and print the element as a data
# frame
print_frequencies <- function(f) {
  for (depth in seq_along(f)) {
    message("=== depth: ", depth)
    depth_table <- kwb.pathdict:::named_vector_to_data_frame(f[[depth]])
    print(depth_table)
  }
}
print_frequencies(frequencies)
#> === depth: 1
#> name value
#> 1 liquid 66
#> === depth: 2
#> name value
#> 1 liquid/oxygen 78
#> === depth: 3
#> name value
#> 1 liquid/oxygen/sharp 76
#> === depth: 4
#> name value
#> 1 liquid/consider/industry/quite 60
#> === depth: 5
#> name value
#> 1 liquid/oxygen/sharp/experiment/continue.xls 43
To return all different paths, set first.only = FALSE
:
all_frequencies <- kwb.pathdict:::get_subdir_frequencies(
paths = paths, first.only = FALSE, dbg = FALSE
)
#> Splitting paths ... ok. (0.00s)
print_frequencies(all_frequencies)
#> === depth: 1
#> name value
#> 1 liquid 66
#> === depth: 2
#> name value
#> 1 liquid/oxygen 78
#> 2 liquid/consider 60
#> 3 liquid/electric.png 19
#> === depth: 3
#> name value
#> 1 liquid/oxygen/sharp 76
#> 2 liquid/consider/industry 72
#> 3 liquid/consider/child.png 25
#> 4 liquid/oxygen/stand.doc 23
#> 5 liquid/oxygen/train.png 23
#> === depth: 4
#> name value
#> 1 liquid/consider/industry/quite 60
#> 2 liquid/oxygen/sharp/phrase 52
#> 3 liquid/consider/industry/power.pdf 34
#> 4 liquid/oxygen/sharp/written.doc 31
#> 5 liquid/oxygen/sharp/experiment 30
#> === depth: 5
#> name value
#> 1 liquid/oxygen/sharp/experiment/continue.xls 43
#> 2 liquid/consider/industry/quite/drink.png 40
#> 3 liquid/consider/industry/quite/wheel.pdf 40
#> 4 liquid/oxygen/sharp/phrase/discuss.R 36
#> 5 liquid/oxygen/sharp/phrase/occur.jpg 36
The function to_frequency_data()
is called next within get_dictionary_one_by_one()
. It converts the list returned by get_subdir_frequencies()
to a data frame.
# Convert frequency list to data frame
(frequency_data <- kwb.pathdict:::to_frequency_data(frequencies))
#> path score length count
#> 1 liquid/oxygen 78 13 6
#> 2 liquid/oxygen/sharp 76 19 4
#> 3 liquid 66 6 11
#> 4 liquid/consider/industry/quite 60 30 2
#> 5 liquid/oxygen/sharp/experiment/continue.xls 43 43 1
(all_frequency_data <- kwb.pathdict:::to_frequency_data(all_frequencies))
#> path score length count
#> 1 liquid/oxygen 78 13 6
#> 2 liquid/oxygen/sharp 76 19 4
#> 3 liquid/consider/industry 72 24 3
#> 4 liquid 66 6 11
#> 5 liquid/consider 60 15 4
#> 6 liquid/consider/industry/quite 60 30 2
#> 7 liquid/oxygen/sharp/phrase 52 26 2
#> 8 liquid/oxygen/sharp/experiment/continue.xls 43 43 1
#> 9 liquid/consider/industry/quite/drink.png 40 40 1
#> 10 liquid/consider/industry/quite/wheel.pdf 40 40 1
#> 11 liquid/oxygen/sharp/phrase/discuss.R 36 36 1
#> 12 liquid/oxygen/sharp/phrase/occur.jpg 36 36 1
#> 13 liquid/consider/industry/power.pdf 34 34 1
#> 14 liquid/oxygen/sharp/written.doc 31 31 1
#> 15 liquid/oxygen/sharp/experiment 30 30 1
#> 16 liquid/consider/child.png 25 25 1
#> 17 liquid/oxygen/stand.doc 23 23 1
#> 18 liquid/oxygen/train.png 23 23 1
#> 19 liquid/electric.png 19 19 1
The function rescore_and_reorder_frequency_data()
takes a data frame with columns length
and count
as input. It calculates score2 = (length - placeholder_size) * count
and orders the data frame decreasingly by this score.
kwb.pathdict:::rescore_and_reorder_frequency_data(
frequency_data,
# Length of placeholder string to be used instead of path
placeholder_size = nchar("<p1>")
)
#> path score length count score2
#> 1 liquid/oxygen/sharp 76 19 4 60
#> 2 liquid/oxygen 78 13 6 54
#> 3 liquid/consider/industry/quite 60 30 2 52
#> 4 liquid/oxygen/sharp/experiment/continue.xls 43 43 1 39
#> 5 liquid 66 6 11 22
The function get_next_level()
is a helper function that is called by the function compress_one_by_one()
.
kwb.pathdict:::get_next_level
#> function (x, key, set.attributes = FALSE, dbg = FALSE)
#> {
#> freqs <- get_subdir_frequencies(paths = x, dbg = dbg)
#> allfreqs <- sort(unlist(freqs), decreasing = TRUE)
#> dict <- structure(list(names(allfreqs[1])), names = key)
#> result <- use_dictionary(x, dict, method = "part")
#> if (set.attributes) {
#> attr(result, "freqs") <- freqs
#> attr(result, "dict") <- dict
#> }
#> result
#> }
#> <bytecode: 0x36e0c90>
#> <environment: namespace:kwb.pathdict>
The function get_subdirs_by_frequency()
is used by a FAKIN script that is not yet part of a package. It takes the following inputs:
subdirs
: subdirectory matrix
cumid
: cumulative identifier, created with kwb.pathdict:::to_cumulative_id()
freqinfo
: one-row data frame with columns depth
, n.x
, n.Freq
kwb.pathdict:::log_result_if
#> function (dbg, x, y)
#> {
#> if (dbg) {
#> kwb.utils::catLines(c("\n### x:", x))
#> kwb.utils::catLines(c("\n### y:", y))
#> cat("\n### str(dict):\n")
#> utils::str(kwb.utils::getAttribute(y, "dict"))
#> }
#> }
#> <bytecode: 0x1bb6368>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::lookup_in_dictionary
#> function (x, dict)
#> {
#> ready <- x %in% to_placeholder(names(dict))
#> out <- x
#> out[!ready] <- to_placeholder(names(dict[match(x[!ready],
#> dict)]))
#> out
#> }
#> <bytecode: 0x1b18718>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::print_path_frequencies
#> function (x, maxchar = 80)
#> {
#> x$path <- substr(x$path, 1, maxchar)
#> print(x)
#> }
#> <bytecode: 0x50a9a98>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::replace_subdirs
#> function (s, r, p)
#> {
#> selected <- starts_with_parts(s, r)
#> cols <- seq(length(r) + 1, ncol(s))
#> fillright <- matrix(nrow = sum(selected), ncol = length(r) -
#> 1)
#> s[selected, ] <- cbind(p, s[selected, cols, drop = FALSE],
#> fillright)
#> maxcol <- max(which(apply(s, 2, function(x) sum(!is.na(x))) >
#> 0))
#> s[, seq_len(maxcol)]
#> }
#> <bytecode: 0x46018c0>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::starts_with_parts
#> function (parts, elements)
#> {
#> stopifnot(is.list(parts) || is.matrix(parts))
#> stopifnot(all(!is.na(elements)))
#> length_out <- if (is.list(parts))
#> length(parts)
#> else nrow(parts)
#> selected_at_level <- lapply(seq_along(elements), function(i) {
#> if (is.list(parts)) {
#> sapply(parts, "[", i) == elements[i]
#> }
#> else {
#> !is.na(parts[, i]) & (parts[, i] == elements[i])
#> }
#> })
#> Reduce(`&`, selected_at_level, init = rep(TRUE, length_out))
#> }
#> <bytecode: 0x4394ba0>
#> <environment: namespace:kwb.pathdict>
kwb.pathdict:::update_frequency_data_length
#> function (frequency_data, winner, key)
#> {
#> get_column <- kwb.utils::selectColumns
#> winner_length <- get_column(winner, "length")
#> winner_path <- get_column(winner, "path")
#> data_length <- get_column(frequency_data, "length")
#> data_path <- get_column(frequency_data, "path")
#> shortage <- winner_length - nchar(to_placeholder(key))
#> matching <- (substr(data_path, 1, winner_length) == winner_path)
#> frequency_data$length[matching] <- data_length[matching] -
#> shortage
#> frequency_data
#> }
#> <bytecode: 0x517e1c8>
#> <environment: namespace:kwb.pathdict>