Codemetar Analysis
Michael Rustler
2022-10-17
Source:vignettes/codemetar-analysis.Rmd
codemetar-analysis.Rmd
Topics
library(jsonld)
library(jsonlite)
library(magrittr)
library(codemetar)
library(purrr)
library(dplyr)
library(printr)
library(tibble)
##################################################################
### Importing "codemetar.json"
##################################################################
### Option 1:
### Not working correctly (-> gives a list that is twice as long as the one
### obtained by just using jsonlite::fromJSON as defined below)
# frame <- system.file("schema/frame_schema.json", package="codemetar")
#
# corpus <- jsonld::jsonld_frame("codemetar.json", frame) %>%
# jsonlite::fromJSON("codemetar.json",
# simplifyVector = FALSE) %>%
# getElement("@graph")
#"https://kwb-r.github.io/pkgmeta/codemetar.json" %>%
# jsonlite::fromJSON() %>%
# jsonlite::write_json("codemetar.json")
### Option 2: works as expected
### Parse the JSON file into nested R lists (no simplification to vectors or
### data frames)
corpus <- "codemetar.json" %>%
  jsonlite::fromJSON(simplifyVector = FALSE)
### Pair each R package identifier with its GitHub topics (keywords), collapsed
### into one comma-separated string per package
pkg_topics <- tibble::tibble(
  name = purrr::map_chr(corpus, "identifier"),
  topic = purrr::map_chr(corpus, function(pkg) {
    # pluck() yields NULL for a missing "keywords" entry, which pastes to ""
    paste(unlist(purrr::pluck(pkg, "keywords")), collapse = ", ")
  })
)
pkg_topics
# One row per (package, topic), then count how often each topic occurs. The
# generic tags "r" and "rstats" are dropped, and the result is ordered by
# decreasing frequency with the word/freq column names used for the word cloud.
topics <- pkg_topics %>%
  tidyr::separate_rows(topic, sep = ",\\s+") %>%
  dplyr::count(topic) %>%
  dplyr::filter(!(topic %in% c("r", "rstats"))) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::rename(word = topic, freq = n)

knitr::kable(topics)
wordcloud2::wordcloud2(data = topics)
Authors
## Handle NULLs explicitly: extract "name" with map(), drop the empty entries,
## then flatten to a character vector
pkgs <- as.character(purrr::compact(purrr::map(corpus, "name")))

# Restrict the corpus to entries that carry a package identifier (name)
keep <- vapply(
  corpus,
  function(pkg) length(pkg$identifier) > 0,
  logical(1)
)
corpus <- corpus[keep]

## After filtering, the names can be extracted directly as a character vector
all_pkgs <- corpus %>%
  purrr::map_chr("name")
head(all_pkgs)
## Number of unique maintainers (3 when this was written): take the family name
## of the first-listed maintainer of every package and count distinct values
length(unique(
  purrr::map(corpus, list("maintainer", 1, "familyName"))
))
Maintainer
## Frequency table of maintainer family names (mostly Hauke), using the
## first-listed maintainer of each package; columns are named word/freq as
## passed on to the word cloud below
maintainer <- corpus %>%
  purrr::map(list("maintainer", 1, "familyName")) %>%
  unlist() %>%
  tibble::enframe(name = NULL) %>%
  dplyr::count(value, sort = TRUE) %>%
  dplyr::rename(word = value, freq = n)

knitr::kable(maintainer)
wordcloud2::wordcloud2(
  data = maintainer,
  size = 0.6,
  minSize = 0.4
)
## Distribution of the number of (co-)authors per package
corpus %>%
  purrr::map_int(function(pkg) length(pkg$author)) %>%
  tibble::enframe() %>%
  dplyr::count(value, sort = TRUE)

## Same distribution for contributors, which are used less often
corpus %>%
  purrr::map_int(function(pkg) length(pkg$contributor)) %>%
  tibble::enframe() %>%
  dplyr::count(value, sort = TRUE)
Package Dependencies
Depends Imports
# Distribution of the number of entries in "softwareRequirements" per package
corpus %>%
  purrr::map_int(function(pkg) length(pkg$softwareRequirements)) %>%
  tibble::enframe() %>%
  dplyr::count(value, sort = TRUE)
# Long table with one row per (package, dependency) pair.
# "softwareRequirements" can take three shapes:
#   * a single, unboxed requirement: a named list that itself has a "name"
#   * a list of requirements, each of which has a "name"
#   * missing/empty: the package has no requirements (dep is NA)
corpus %>%
  purrr::map_df(function(x) {
    reqs <- x$softwareRequirements
    if ("name" %in% names(reqs)) {
      # single, unboxed dep: the dependency is the requirement's own name,
      # not the identifier of the package that declares it
      dep <- reqs[["name"]]
    } else if (length(reqs) > 0 && "name" %in% names(reqs[[1]])) {
      # list of deps: inspect the first element (the list itself is unnamed),
      # then collect the name of every requirement
      dep <- purrr::map_chr(reqs, "name")
    } else {
      # no requirements; typed NA keeps the column character in every row
      dep <- NA_character_
    }
    tibble::tibble(identifier = x$identifier, dep = dep)
  })
# One row per package: its identifier and the names of its software
# requirements collapsed into a single comma-separated string (NA when the
# package has no requirements). `deps` is a plain character column so that it
# can be split with tidyr::separate_rows().
deps_df <- tibble::tibble(
  identifier = purrr::map_chr(corpus, "identifier"),
  deps = purrr::map_chr(
    purrr::map(corpus, "softwareRequirements"),
    function(reqs) {
      # a single, unboxed requirement is itself a named list with a "name"
      # entry; wrap it so it is treated like a list of requirements
      if ("name" %in% names(reqs)) {
        reqs <- list(reqs)
      }
      joined <- paste(purrr::map_chr(reqs, "name"), collapse = ",")
      # scalar condition, so a plain if/else is used rather than ifelse()
      if (nzchar(joined)) joined else NA_character_
    }
  )
)
# Expand the comma-separated dependency strings to one row per
# (dependency, package) pair, sorted by dependency and then by package
dep_df <- deps_df %>%
  tidyr::separate_rows(deps, sep = ",") %>%
  dplyr::arrange(deps, identifier) %>%
  dplyr::select(dependancy = deps, used_in_r_package = identifier)
# Which dependencies are used most frequently? Count the packages per
# dependency, most frequent first, with the word/freq column names that are
# handed to the word cloud.
dep_count <- dep_df %>%
  dplyr::count(dependancy, sort = TRUE) %>%
  dplyr::rename(word = dependancy, freq = n)

knitr::kable(dep_count)
wordcloud2::wordcloud2(
  data = dep_count,
  size = 1,
  minSize = 0.4
)
Depends Imports and Suggests
# Alternate approach using a JSON-LD frame instead of purrr functions for
# subsetting the data. Note that this gets all Depends and Suggests (really
# all SoftwareApplication types mentioned).
# JSON-LD frame passed to jsonld_frame() below; it names the codemeta context
# and requests the "name" property explicitly
dep_frame <- '{
"@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
"@explicit": "true",
"name": {}
}'

# Frame the JSON file, keep the nodes typed "SoftwareApplication" and count how
# often each name occurs (most frequent first, word/freq columns)
dep_sug_count <- "codemetar.json" %>%
  jsonld::jsonld_frame(dep_frame) %>%
  jsonlite::fromJSON() %>%
  getElement("@graph") %>%
  dplyr::filter(type == "SoftwareApplication") %>%
  dplyr::group_by(name) %>%
  dplyr::tally(sort = TRUE) %>%
  dplyr::rename(word = name, freq = n)

knitr::kable(dep_sug_count)
wordcloud2::wordcloud2(
  data = dep_sug_count,
  size = 1,
  minSize = 0.4
)