Provide some example R scripts locally
In order to test the functions of this package some R scripts are downloaded from GitHub and provided locally:
`%>%` <- magrittr::`%>%`
# Create a temporary folder
root <- kwb.utils::tempSubdirectory("test")
# Function to download a script file from a KWB package on GitHub
download_kwb_script <- function(repo, script)
{
url <- sprintf(
"https://raw.githubusercontent.com/%s/master/R/%s",
repo, script
)
destfile <- file.path(root, basename(url))
download.file(url, destfile = destfile, mode = "wt")
destfile
}
# Download three scripts to the temporary folder
scripts <- c(
download_kwb_script("KWB-R/kwb.utils", "log.R"),
download_kwb_script("KWB-R/kwb.utils", "main.R"),
download_kwb_script("KWB-R/kwb.fakin", "plot_file_distribution.R")
)
Exported functions
analyse()
The parse tree is analysed. Each node of the tree is given the following attributes:
- type
- mode
- class
- length
- text
- is
- n_modes
- n_classes
- path
- fulltype
- children
The idea probably was to use these information to extract objects of special interest from the parse tree (see below: get_elements_by_type())
arg_names()
This function returns the names of the arguments of a function:
kwb.code::arg_names(kwb.utils::selectColumns)
#> [1] "x" "columns" "pattern" "drop" "do.stop"
find_string_constants()
This function requires a directory of R scripts. All scripts are parsed. String constants that are used in the script are returned.
string_constants <- kwb.code::find_string_constants(root)
#> Splitting paths ... ok. (0.00 secs)
#> Removing the first 8 path segments ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/log.R ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/main.R ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/plot_file_distribution.R ... ok. (0.00 secs)
knitr::kable(string_constants)
file_id | folder_path | file_name | string | count |
---|---|---|---|---|
file_01 | test | log.R | 1 | |
file_01 | test | log.R | … | 1 |
file_01 | test | log.R | *** | 3 |
file_01 | test | log.R | *** ok. | 1 |
file_02 | test | main.R | 2 | |
file_02 | test | main.R | *** The object is empty! | 1 |
file_02 | test | main.R | as requested but | 1 |
file_02 | test | main.R | _()$ | 1 |
file_02 | test | main.R | ! | 1 |
file_02 | test | main.R | . | 1 |
file_02 | test | main.R | ‘assign(x, value, envir = .GlobalEnv)’ | 1 |
file_02 | test | main.R | (returned in attribute ‘invalid’) | 1 |
file_02 | test | main.R | %s_%d | 1 |
file_02 | test | main.R | %s_1 | 1 |
file_02 | test | main.R | $ | 1 |
file_02 | test | main.R | assignGlobally() | 1 |
file_02 | test | main.R | Bug in randomValuesWithSum(): The sum of generated values is not | 1 |
file_02 | test | main.R | Division by zero. Using substitute value of | 1 |
file_02 | test | main.R | invalid | 1 |
file_02 | test | main.R | parName | 1 |
file_02 | test | main.R | parVal | 1 |
file_02 | test | main.R | stringsAsFactors | 1 |
file_02 | test | main.R | stringsAsFactors must be TRUE or FALSE | 1 |
file_02 | test | main.R | The first element must not be NA | 1 |
file_02 | test | main.R | There are differences in parallel non-NA values | 1 |
file_02 | test | main.R | There are duplicate values: | 1 |
file_03 | test | plot_file_distribution.R | lr | 1 |
file_03 | test | plot_file_distribution.R | v | 1 |
find_weaknesses_in_scripts()
Check for expressions in scripts that can be improved.
x <- parse(text = c(
"texts <- c(",
" paste('this is a very long', 'text'),",
" paste('this is a very long', 'string')",
")",
"",
"indices <- 1:length(texts)"
))
weaknesses <- kwb.code::find_weaknesses_in_scripts(
x = list(test = x),
min_duplicate_frequency = 2L
)
knitr::kable(weaknesses)
file | expression | frequency | recommendation | |
---|---|---|---|---|
1 | test | 1:length(texts) | 1 | use seq_along() |
3 | test | “this is a very long” | 2 | check for duplicated strings |
get_elements_by_type
This function groups similar elements that are found in a parse tree.
# Parse an R script file (here, a file from kwb.utils)
x <- parse(scripts[1L])
# For each "type" of code segment, extract all occurrences
elements <- kwb.code::get_elements_by_type(x, result = result)
# Show all code blocks in curly braces
elements[["language|call|{|2|"]]
#> NULL
get_full_function_info()
This function analyses a list of parse trees each of which has been read from an R script.
It provides information on the functions that are defined in the scripts:
- script: script name
- functionName: function name
- n.def: always 1 (?)
- bodyClass: always “{” (?)
- n.args: number of arguments
- n.defaults: number of arguments with a default
- n.expr: number of expressions
trees <- kwb.code::parse_scripts(root, dbg = FALSE)
function_info <- kwb.code::get_full_function_info(trees)
knitr::kable(function_info)
script | functionName | n.def | bodyClass | n.args | n.defaults | n.expr |
---|---|---|---|---|---|---|
log.R | .log | 1 | { | 1 | 0 | 1 |
log.R | .logline | 1 | { | 1 | 0 | 1 |
log.R | .logok | 1 | { | 1 | 1 | 1 |
log.R | .logstart | 1 | { | 2 | 1 | 1 |
plot_file_distribution.R | arrange_file_in_depth_plots | 1 | { | 3 | 0 | 5 |
main.R | assignAll | 1 | { | 3 | 1 | 2 |
main.R | assignGlobally | 1 | { | 2 | 0 | 2 |
main.R | assignPackageObjects | 1 | { | 1 | 0 | 1 |
main.R | breakInSequence | 1 | { | 2 | 1 | 2 |
main.R | callWithStringsAsFactors | 1 | { | 3 | 0 | 6 |
main.R | extendLimits | 1 | { | 4 | 2 | 3 |
main.R | getEvenNumbers | 1 | { | 1 | 0 | 1 |
main.R | getGlobally | 1 | { | 3 | 2 | 2 |
main.R | getOddNumbers | 1 | { | 1 | 0 | 1 |
main.R | hsMatrixToListForm | 1 | { | 6 | 4 | 4 |
main.R | hsSafeName | 1 | { | 2 | 0 | 3 |
main.R | makeUnique | 1 | { | 4 | 3 | 3 |
main.R | naToLastNonNa | 1 | { | 2 | 1 | 6 |
main.R | parallelNonNA | 1 | { | 2 | 0 | 12 |
plot_file_distribution.R | plot_file_distribution | 1 | { | 5 | 1 | 11 |
main.R | quotient | 1 | { | 4 | 2 | 6 |
main.R | randomValuesWithSum | 1 | { | 3 | 1 | 5 |
main.R | recursiveNames | 1 | { | 2 | 1 | 4 |
main.R | warnIfEmpty | 1 | { | 1 | 0 | 2 |
get_names_of_used_packages()
What packages are used in the scripts?
kwb.code::get_names_of_used_packages(root)
#> Analysing /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/log.R ... ok. (0.00 secs)
#> Analysing /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/main.R ... ok. (0.00 secs)
#> Analysing /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/plot_file_distribution.R ... ok. (0.00 secs)
#> character(0)
This function simply looks for calls to library()
. It
does not take into account functions that are called with
::
as the following simple grep()
reveals:
pattern <- "[^A-Za-z_.]([A-Za-z_.]+::[A-Za-z_.]+)[^A-Za-z_.]"
text <- grep(pattern, readLines(scripts[3L]), value = TRUE)
unique(kwb.utils::extractSubstring(pattern, text, index = 1))
#> [1] "kwb.pathdict::random_paths" "kwb.utils::noFactorDataFrame"
#> [3] "kwb.fakin::plot_file_distribution" "kwb.utils::assignPackageObjects"
#> [5] "kwb.utils::preparePdfIf" "kwb.utils::finishAndShowPdfIf"
#> [7] "cowplot::plot_grid"
TODO: Use another function instead…
get_package_function_usage()
tree <- kwb.code::parse_scripts(root, dbg = FALSE)
function_usage <- kwb.code::get_package_function_usage(
tree,
package = "kwb.utils"
)
knitr::kable(function_usage)
package | name | count | explicit | implicit |
---|---|---|---|---|
kwb.utils | catIf | 2 | 0 | 2 |
kwb.utils | finishAndShowPdfIf | 1 | 1 | 0 |
kwb.utils | hsRestoreAttributes | 1 | 0 | 1 |
kwb.utils | isEvenNumber | 1 | 0 | 1 |
kwb.utils | isNullOrEmpty | 1 | 0 | 1 |
kwb.utils | isOddNumber | 1 | 0 | 1 |
kwb.utils | lastElement | 1 | 0 | 1 |
kwb.utils | preparePdfIf | 1 | 1 | 0 |
kwb.utils | rbindAll | 1 | 0 | 1 |
kwb.utils | recursiveNames | 1 | 0 | 1 |
kwb.utils | stringList | 1 | 0 | 1 |
kwb.utils | warningDeprecated | 1 | 0 | 1 |
get_package_usage_per_script()
package_usage <- kwb.code::get_package_usage_per_script(
root,
packages = "kwb.utils"
)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/log.R ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/main.R ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/plot_file_distribution.R ... ok. (0.00 secs)
#> 3 scripts have been parsed.
#> Checking usage of kwb.utils ...
#> ok. (0.01 secs)
knitr::kable(package_usage)
package | script | name | count | explicit | implicit |
---|---|---|---|---|---|
kwb.utils | log.R | catIf | 2 | 0 | 2 |
kwb.utils | main.R | hsRestoreAttributes | 1 | 0 | 1 |
kwb.utils | main.R | isEvenNumber | 1 | 0 | 1 |
kwb.utils | main.R | isNullOrEmpty | 1 | 0 | 1 |
kwb.utils | main.R | isOddNumber | 1 | 0 | 1 |
kwb.utils | main.R | lastElement | 1 | 0 | 1 |
kwb.utils | main.R | rbindAll | 1 | 0 | 1 |
kwb.utils | main.R | recursiveNames | 1 | 0 | 1 |
kwb.utils | main.R | stringList | 1 | 0 | 1 |
kwb.utils | main.R | warningDeprecated | 1 | 0 | 1 |
kwb.utils | plot_file_distribution.R | finishAndShowPdfIf | 1 | 1 | 0 |
kwb.utils | plot_file_distribution.R | preparePdfIf | 1 | 1 | 0 |
get_string_constants_in_scripts()
string_constants <- kwb.code::get_string_constants_in_scripts(root)
#> Splitting paths ... ok. (0.00 secs)
#> Removing the first 8 path segments ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/log.R ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/main.R ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/plot_file_distribution.R ... ok. (0.00 secs)
knitr::kable(string_constants)
file_id | string | count |
---|---|---|
file_01 | 1 | |
file_01 | … | 1 |
file_01 | *** | 3 |
file_01 | *** ok. | 1 |
file_02 | 2 | |
file_02 | *** The object is empty! | 1 |
file_02 | as requested but | 1 |
file_02 | _()$ | 1 |
file_02 | ! | 1 |
file_02 | . | 1 |
file_02 | ‘assign(x, value, envir = .GlobalEnv)’ | 1 |
file_02 | (returned in attribute ‘invalid’) | 1 |
file_02 | %s_%d | 1 |
file_02 | %s_1 | 1 |
file_02 | $ | 1 |
file_02 | assignGlobally() | 1 |
file_02 | Bug in randomValuesWithSum(): The sum of generated values is not | 1 |
file_02 | Division by zero. Using substitute value of | 1 |
file_02 | invalid | 1 |
file_02 | parName | 1 |
file_02 | parVal | 1 |
file_02 | stringsAsFactors | 1 |
file_02 | stringsAsFactors must be TRUE or FALSE | 1 |
file_02 | The first element must not be NA | 1 |
file_02 | There are differences in parallel non-NA values | 1 |
file_02 | There are duplicate values: | 1 |
file_03 | lr | 1 |
file_03 | v | 1 |
parse_scripts()
x <- kwb.code::parse_scripts(root)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/log.R ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/main.R ... ok. (0.00 secs)
#> Reading /var/folders/3m/p59k4qdj0f17st0gn2cmj3640000gn/T//RtmpppsUPZ/test/plot_file_distribution.R ... ok. (0.00 secs)
str(x, 2)
#> List of 3
#> $ log.R :length 4 expression(.log <- function(...) { cat("***", ...) }, .logline <- function(...) { cat("***", ..., "\n") }, .log| __truncated__ ...
#> ..- attr(*, "n.lines")= int 50
#> $ main.R :length 18 expression(randomValuesWithSum <- function(n, sumOfValues, names = seq_len(n)) { breaks <- sort(sample(sumOfValu| __truncated__ ...
#> ..- attr(*, "n.lines")= int 575
#> $ plot_file_distribution.R:length 2 expression(plot_file_distribution <- function(file_data, start_path, n_root_parts, ..., to_pdf = TRUE) { data_s| __truncated__
#> ..- attr(*, "n.lines")= int 93
to_full_script_info()
This function creates statistics about R scripts.
script_statistics <- kwb.code::to_full_script_info(trees)
knitr::kable(script_statistics)
script | errors | rows | expr | rpe | <- | fun |
---|---|---|---|---|---|---|
log.R | 50 | 4 | 12.5 | 4 | 4 | |
main.R | 575 | 18 | 31.9 | 18 | 18 | |
plot_file_distribution.R | 93 | 2 | 46.5 | 2 | 2 |
Interesting base functions
getParseData()
"sum(kwb.utils::selectColumns(df, col))" %>%
parse(text = ., keep.source = TRUE) %>%
getParseData()
#> line1 col1 line2 col2 id parent token terminal text
#> 26 1 1 1 38 26 0 expr FALSE
#> 1 1 1 1 3 1 3 SYMBOL_FUNCTION_CALL TRUE sum
#> 3 1 1 1 3 3 26 expr FALSE
#> 2 1 4 1 4 2 26 '(' TRUE (
#> 21 1 5 1 37 21 26 expr FALSE
#> 7 1 5 1 28 7 21 expr FALSE
#> 4 1 5 1 13 4 7 SYMBOL_PACKAGE TRUE kwb.utils
#> 5 1 14 1 15 5 7 NS_GET TRUE ::
#> 6 1 16 1 28 6 7 SYMBOL_FUNCTION_CALL TRUE selectColumns
#> 8 1 29 1 29 8 21 '(' TRUE (
#> 9 1 30 1 31 9 11 SYMBOL TRUE df
#> 11 1 30 1 31 11 21 expr FALSE
#> 10 1 32 1 32 10 21 ',' TRUE ,
#> 15 1 34 1 36 15 17 SYMBOL TRUE col
#> 17 1 34 1 36 17 21 expr FALSE
#> 16 1 37 1 37 16 21 ')' TRUE )
#> 22 1 38 1 38 22 26 ')' TRUE )
"do.call(kwb.utils::selectColumns, list(df, col))" %>%
parse(text = ., keep.source = TRUE) %>%
getParseData()
#> line1 col1 line2 col2 id parent token terminal text
#> 32 1 1 1 48 32 0 expr FALSE
#> 1 1 1 1 7 1 3 SYMBOL_FUNCTION_CALL TRUE do.call
#> 3 1 1 1 7 3 32 expr FALSE
#> 2 1 8 1 8 2 32 '(' TRUE (
#> 7 1 9 1 32 7 32 expr FALSE
#> 4 1 9 1 17 4 7 SYMBOL_PACKAGE TRUE kwb.utils
#> 5 1 18 1 19 5 7 NS_GET TRUE ::
#> 6 1 20 1 32 6 7 SYMBOL TRUE selectColumns
#> 8 1 33 1 33 8 32 ',' TRUE ,
#> 27 1 35 1 47 27 32 expr FALSE
#> 12 1 35 1 38 12 14 SYMBOL_FUNCTION_CALL TRUE list
#> 14 1 35 1 38 14 27 expr FALSE
#> 13 1 39 1 39 13 27 '(' TRUE (
#> 15 1 40 1 41 15 17 SYMBOL TRUE df
#> 17 1 40 1 41 17 27 expr FALSE
#> 16 1 42 1 42 16 27 ',' TRUE ,
#> 21 1 44 1 46 21 23 SYMBOL TRUE col
#> 23 1 44 1 46 23 27 expr FALSE
#> 22 1 47 1 47 22 27 ')' TRUE )
#> 28 1 48 1 48 28 32 ')' TRUE )