Codemetar Analysis

Topics


library(jsonld)
library(jsonlite)
library(magrittr)
library(codemetar)
library(purrr)
library(dplyr)
library(printr)
library(tibble)
##################################################################
### Importing "codemetar.json"
##################################################################

### Option 1:
### Not working correctly: it gives a list that is twice as long as the
### one obtained by just using jsonlite::fromJSON() (see Option 2 below).

# frame <- system.file("schema/frame_schema.json", package="codemetar")
# 
# corpus <- jsonld::jsonld_frame("codemetar.json", frame) %>%
#  jsonlite::fromJSON("codemetar.json",
#                              simplifyVector = FALSE) %>%
#   getElement("@graph")


### Fetch the published metadata and store it locally as "codemetar.json":
#"https://kwb-r.github.io/pkgmeta/codemetar.json" %>%  
#  jsonlite::fromJSON() %>% 
#  jsonlite::write_json("codemetar.json")

### Option 2: working as expected
### Parse the local file into nested lists (no simplification to vectors
### or data frames), so each entry can be queried with purrr extractors.

corpus <- "codemetar.json" %>%
  jsonlite::fromJSON(simplifyVector = FALSE)

### Add GitHub topics to R package names: one row per corpus entry, with
### its "keywords" collapsed into a single comma-separated string
### (an entry without keywords yields an empty string).
pkg_topics <- tibble::tibble(
  name = purrr::map_chr(corpus, "identifier"),
  topic = purrr::map(corpus, "keywords") %>%
    purrr::map_chr(~ paste(unlist(.x), collapse = ", "))
)
pkg_topics

# Split the comma-separated topics into one row each, drop the generic
# "r" / "rstats" tags and tabulate the remaining topics by frequency
# (most frequent first). Columns are named word/freq as expected by
# wordcloud2.
topics <- pkg_topics %>%
  tidyr::separate_rows(topic, sep = ",\\s+") %>%
  dplyr::filter(!topic %in% c("r", "rstats")) %>%
  dplyr::count(topic, sort = TRUE, name = "freq") %>%
  dplyr::rename(word = topic)


knitr::kable(topics)

wordcloud2::wordcloud2(topics)

Authors


## Deal with NULLs explicitly: extract "name" with map(), drop the NULL
## results and only then coerce to a character vector
pkgs <- as.character(purrr::compact(purrr::map(corpus, "name")))

# Keep only those entries that have a package identifier
keep <- purrr::map_lgl(corpus, function(pkg) length(pkg$identifier) > 0)
corpus <- corpus[keep]

## On the filtered corpus the names are extracted directly with map_chr()
all_pkgs <- purrr::map_chr(corpus, "name")
head(all_pkgs)

## Number of unique family names of the first listed maintainer
## (3 unique maintainers according to the original author's note)
length(unique(purrr::map(corpus, list("maintainer", 1, "familyName"))))

Maintainer

## Tabulate how many packages each (first listed) maintainer family name
## appears in, most frequent first. Original author's note: "Mostly Hauke".
maintainer <- corpus %>%
  purrr::map(list("maintainer", 1, "familyName")) %>%
  unlist() %>%
  tibble::enframe(name = NULL) %>%
  dplyr::count(value, sort = TRUE) %>%
  dplyr::rename(word = value, freq = n)

knitr::kable(maintainer)

wordcloud2::wordcloud2(
  maintainer,
  size = 0.6,
  minSize = 0.4
)

## Number of co-authors: tabulate length(author) over all packages and
## show how many packages share each length, most common first
corpus %>%
  purrr::map_int(~ length(.x$author)) %>%
  tibble::enframe() %>%
  dplyr::count(value, sort = TRUE)

## Same tabulation for the "contributor" field; the original author notes
## that contributors aren't used as much
corpus %>%
  purrr::map_int(~ length(.x$contributor)) %>%
  tibble::enframe() %>%
  dplyr::count(value, sort = TRUE)

Package Dependencies

Depends Imports

# Tabulate length(softwareRequirements) over all packages: how many
# packages share each length, most common first
corpus %>%
  purrr::map_int(~ length(.x$softwareRequirements)) %>%
  tibble::enframe() %>%
  dplyr::count(value, sort = TRUE)

## Build a long table with one row per (package, required package) pair.
## Three shapes of `softwareRequirements` are handled:
##   * a single, unboxed requirement (the record itself has a "name" field),
##   * a list of requirement records (each record has a "name" field),
##   * no requirements at all (dep is NA).
corpus %>%
  map_df(function(x) {
    reqs <- x$softwareRequirements

    if ("name" %in% names(reqs)) {
      ## single, unboxed dep: take the name of the requirement itself
      ## (not the identifier of the package that declares it)
      dep <- reqs[["name"]]
    } else if (length(reqs) > 0 && "name" %in% names(reqs[[1]])) {
      ## list of requirement records; the length guard avoids indexing
      ## into an empty list, and records without a name become NA
      dep <- map_chr(reqs, "name", .default = NA_character_)
    } else { ## No requirements
      dep <- NA_character_
    }

    tibble(identifier = x$identifier, dep = dep)
  })

# One row per package. `deps` is a list column holding, for each package,
# the names of its software requirements joined by "," into one string,
# or NA when the joined string is empty (no requirements).
deps_df <- tibble(
  identifier = purrr::map_chr(corpus, "identifier"),
  deps = lapply(
    purrr::map(corpus, "softwareRequirements"),
    function(reqs) {
      joined <- paste(purrr::map_chr(reqs, "name"), collapse = ",")
      if (nzchar(joined)) joined else NA_character_
    }
  )
)

# Expand the comma-separated dependency strings to one row per
# (dependency, package) pair, sorted by dependency and then by package
dep_df <- deps_df %>%
  tidyr::separate_rows(deps, sep = ",") %>%
  dplyr::arrange(deps, identifier) %>%
  dplyr::select(dependancy = deps, used_in_r_package = identifier)




# Which dependencies are used most frequently?
# Count the rows per dependency (most frequent first) and name the
# columns word/freq as expected by wordcloud2.
dep_count <- dep_df %>%
  dplyr::count(dependancy, sort = TRUE) %>%
  dplyr::rename(word = dependancy, freq = n)

knitr::kable(dep_count)

wordcloud2::wordcloud2(
  dep_count,
  size = 1,
  minSize = 0.4
)

Depends Imports and Suggests

# Alternate approach using a JSON-LD frame instead of purrr functions for
# subsetting the data.
# Note that this gets all Depends and Suggests (really all
# SoftwareApplication types mentioned).
dep_frame <- '{
  "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
"@explicit": "true",
"name": {}
}'

# Frame the file, parse the result, keep the "@graph" entries of type
# SoftwareApplication and tabulate them by name (most frequent first).
dep_sug_count <- jsonld::jsonld_frame("codemetar.json", dep_frame) %>%
  jsonlite::fromJSON() %>%
  magrittr::extract2("@graph") %>%
  dplyr::filter(type == "SoftwareApplication") %>%
  dplyr::group_by(name) %>%
  dplyr::tally(sort = TRUE) %>%
  dplyr::rename(word = name, freq = n)

knitr::kable(dep_sug_count)

wordcloud2::wordcloud2(
  dep_sug_count,
  size = 1,
  minSize = 0.4
)

Michael Rustler

2022-10-17

Topics

Authors

Maintainer

Package Dependencies

Depends Imports

Depends Imports and Suggests

7 Session Info

Platform

Packages

Pandoc