Skip to contents

1 Install R packages

if (! require("remotes")) {
  install.packages("remotes", repos = "https://cloud.r-project.org")
}

remotes::install_github("kwb-r/kwb.geosalz", dependencies = TRUE)

2 Setup the project

2.1 Define paths

# Define paths and resolve placeholders
paths <- list(
  servername = Sys.getenv("servername"),
  root_server = "//<servername>/processing",
  project = "geosalz",
  task = "labor",
  processing = "<root>/<project>/<task>",
  input_dir = "<processing>/precleaned-data/v0.2",
  input_dir_meta = "<input_dir>/META",
  export_dir = "<processing>/precleaned-data/v0.3",
  export_dir_meta = "<export_dir>/META",
  cleaned_data_dir = "<processing>/cleaned-data",
  figures_dir = "<processing>/figures",
  foerdermengen = "<export_dir>/2018-04-27 Rohwasser Bericht - Galeriefördermengen.xlsx",
  parameters = "<export_dir_meta>/2018-06-01 Lab Parameter.xlsx",
  lookup_para = "<export_dir_meta>/lookup_para.csv",
  sites = "<export_dir_meta>/Info-Altdaten.xlsx"
)

paths <- kwb.utils::resolve(paths, root = "root_server")
#paths <- kwb.utils::resolve(paths, root = "C:/projects")

2.2 Check input directory

Check if input directory selected above exists:

# Check if input directory exists
input_dir <- kwb.utils::safePath(kwb.utils::selectElements(paths, "input_dir"))

2.3 Check export directory

Check and in case the export directory is not available create it:

# Check if exists and if not create it

if (!dir.exists(paths$export_dir)) {
  print(sprintf("Creating export directory: %s", paths$export_dir))
  fs::dir_create(paths$export_dir)
}

export_dir <- kwb.utils::safePath(kwb.utils::selectElements(paths, "export_dir"))
## Convert xls to xlsx files
kwb.geosalz::convert_xls_as_xlsx(input_dir, export_dir)

## Copy xlsx files 
kwb.geosalz::copy_xlsx_files(input_dir, export_dir, overwrite = TRUE)

3 Data import

3.1 Categorise the Excel Files

Select files if they consist of:

  • Header 1 (meta): with manually added metadata in an extra sheet
  • Header 1: single header tables
  • Header 2: two-row header tables or
  • Header 4: four row header tables
# Get all xlsx files to be imported
files <- dir(export_dir, ".xlsx", recursive = TRUE, full.names = TRUE)

files_meta <- c(
  "Meta Info",
  "Header ident",
  "Parameter ident",
  "Parameter",
  "Info-Altdaten",
  "Brandenburg_Parameter_BWB_Stolpe",
  "Kopie von Brandenburg_Parameter_BWB_Stolpe",
  "2005-10BeschilderungProbenahmestellenGWWIII",
  "Bezeichnungen der Reinwasserstellen",
  "ReinwasserNomenklatur",
  "Info zu Altdaten 1970-1998",
  "2018-06-01 Lab Parameter"
)

files_header1_meta <- c(
  "FRI_Br_GAL_C_Einzelparameter",
  "FRI_Roh_Rein_NH4+NO3_2001-2003",
  "MTBE_2003-11_2004",
  "Reinwasser_2003_Fe_Mn", ## unclean
  "VC_CN_in Brunnen bis Aug_2005 ", ## unclean
  "Wuhlheide_Beelitzhof_Teildaten" ## unclean
)

files_header1 <- c(
  "2018-04-11 Chlorid in Brunnen - Übersicht",
  "2018-04-27 LIMS Reiw & Rohw Sammel ",
  "2018-04-27 Rohwasser Bericht - Galeriefördermengen"
)

files_header4 <- c(
  "STO Rohw_1999-6_2004",
  "Wuhlheide_1999-2003_Okt - Neu",
  "KAU_1999-Okt2003"
)

files_archive <- "Siebert"

files_to_ignore <- c(
  files_meta, files_header1, files_header1_meta,
  files_header4, files_archive
)

in_files_to_ignore <- kwb.utils::removeExtension(basename(files)) %in% 
  files_to_ignore

filepaths_header2 <- files[!in_files_to_ignore]

3.2 Header 1 (with metadata)

cond1 <- kwb.utils::removeExtension(basename(files)) %in% files_header1_meta

filepaths_header1_meta <- files[cond1]

labor_list_1meta <- kwb.geosalz::import_labor(
  filepaths_header1_meta ,
  export_dir,
  func = kwb.geosalz::read_bwb_header1_meta
)

has_errors <- sapply(labor_list_1meta, kwb.utils::isTryError)
#has_errors

labor_df_1meta <- data.table::rbindlist(
  l = labor_list_1meta[!has_errors],
  fill = TRUE
)
library(dplyr)

cond2 <- labor_df_1meta$OriginalName %in% c(
  "el. Leitfähigkeit (25 °C)", "Chlorid", "Sulfat"
)

labor_df_1meta[cond2 & !is.na(labor_df_1meta$DataValue), ] %>%
  dplyr::group_by_("OriginalName", "Meßstelle") %>%
  dplyr::summarise(n = n())

3.3 Header 2

labor_header2_list <- kwb.geosalz::import_labor(
  files = filepaths_header2,
  export_dir = export_dir,
  func = kwb.geosalz::read_bwb_header2
)

has_errors <- sapply(labor_header2_list, kwb.utils::isTryError)
#has_errors

labor_header2_df <- data.table::rbindlist(
  l = labor_header2_list[!has_errors],
  fill = TRUE
)

3.4 Header 4

cond3 <- kwb.utils::removeExtension(basename(files)) %in% files_header4

filepaths_header4 <- files[cond3]

labor_header4_list <- kwb.geosalz::import_labor(
  files = filepaths_header4,
  export_dir = export_dir,
  func = kwb.geosalz::read_bwb_header4
)

has_errors <- sapply(labor_header4_list, inherits, "try-error")
has_errors

labor_header4_df <- data.table::rbindlist(
  l = labor_header4_list[!has_errors],
  fill = TRUE
)

4 Data cleaning

4.1 Merging Header2 and Header4

labor_all <- data.table::rbindlist(
  l = list(
    x1 = labor_header2_df,
    x2 = labor_header4_df
  ),
  fill = TRUE
)

4.2 Filtering

labor_all <- labor_all %>%
  dplyr::filter(!is.na(.data$DataValue)) %>%
  dplyr::mutate(
    Date = dplyr::if_else(
      condition = !is.na(.data$Datum),
      true = .data$Datum,
      false = .data$Probenahme
    )
  ) %>%
  ### Some "Datum" rentries are missing in;
  ### K-TL_LSW-Altdaten-Werke Teil 1\Werke Teil 1\Kaulsdorf\KAU_1999-Okt2003.xlsx
  ### sheets: 66 KAU Rein 1999-2000, 65 KAU NordSüd 1999-2000
  dplyr::filter(!is.na(.data$Date)) %>%
  dplyr::filter(!is.na(.data$VariableName_org))

nrow(labor_all)

4.3 Reducing dataset by adding metadata

labordaten_ww <- kwb.geosalz::add_para_metadata(
  df = labor_all,
  lookup_para_path = paths$lookup_para,
  parameters_path = paths$parameters
)

labordaten_ww <- kwb.geosalz::add_site_metadata(
  df = labordaten_ww,
  site_path = paths$sites
) %>%
  dplyr::mutate(
    year = as.numeric(format(.data$Date,format = '%Y')),
    DataValue = as.numeric(.data$DataValue)
  )

nrow(labordaten_ww)

5. Exporting cleaned dataset

fs::dir_create(paths$cleaned_data_dir, recursive = TRUE)
print(sprintf("Export cleaned data to: %s", paths$cleaned_data_dir))
foerdermengen_ww <- kwb.geosalz::get_foerdermengen(paths$foerdermengen)

save(
  labordaten_ww, 
  foerdermengen_ww,
  file = file.path(paths$cleaned_data_dir, "cleaned-data.Rds")
)

write.csv2(
  labordaten_ww,
  file.path(paths$cleaned_data_dir, "labordaten_ww.csv"),
  row.names = FALSE
)

write.csv2(
  foerdermengen_ww,
  file = file.path(paths$cleaned_data_dir, "foerdermengen_ww.csv"),
  row.names = FALSE
)

6. Visualisation

library(ggplot2)

fs::dir_create(paths$figures_dir, recursive = TRUE)
print(sprintf("Export figures/plots to: %s", paths$figures_dir))

para_info <- kwb.geosalz::get_parameters_meta(paths$parameters)
water_types <- c("Reinwasser", "Rohwasser")

for (water_type in water_types) {
  
  pdf_file <- file.path(
    paths$figures_dir,
    sprintf(
      "Zeitreihen_Jahresmittelwerte_Werke_%s.pdf",
      water_type
    )
  )
  
  cat(sprintf("Creating plot:\n%s\n", pdf_file))
  
  pdf(file = pdf_file, width = 14, height = 9)
  
  for (sel_para_id in unique(labordaten_ww$para_id)) {
    
    my_selection <- sprintf(
      "%s (%s)",
      para_info$para_kurzname[para_info$para_id == sel_para_id],
      water_type
    )
    
    tmp <- labordaten_ww %>%
      dplyr::filter(prufgegenstand == water_type) %>%
      dplyr::filter(para_id == sel_para_id) %>%
      dplyr::group_by(.data$para_kurzname, .data$werk, .data$year) %>%
      dplyr::summarise(
        mean_DataValue = mean(as.numeric(.data$DataValue), na.rm = TRUE)
      ) %>%
      dplyr::filter(!is.na(.data$werk)) %>%
      dplyr::left_join(
        y = kwb.geosalz::get_foerdermengen(paths$foerdermengen),
        by = c("werk", "year")
      )
    
    if (nrow(tmp) > 0) {
      
      cat(sprintf("for %s\n", my_selection))
      
      g <- ggplot2::ggplot(tmp, mapping = ggplot2::aes_string(
        x = "year",
        y = "mean_DataValue",
        col = "werk"
      )) +
        ggplot2::geom_point() +
        ggplot2::geom_line() +
        ggplot2::theme_bw() +
        ggplot2::ggtitle(label = my_selection) +
        ggplot2::labs(x = "", y = "Jahresmittelwert")
      
      print(g)
      
    } else {
      
      cat(sprintf("not data availabe for %s\n", my_selection))
    }
  }
  dev.off()
}

7 Session Info

Plattform

name value
version R version 4.2.3 (2023-03-15 ucrt)
os Windows Server x64 (build 20348)
system x86_64, mingw32
ui RTerm
language en
collate English_United States.utf8
ctype English_United States.utf8
tz UTC
date 2023-04-15
pandoc 2.19.2 @ C:/HOSTED1/windows/pandoc/2191.2/x64/PANDOC~1.2/ (via rmarkdown)

Packages

#>  package     * version date (UTC) lib source
#>  bslib         0.4.2   2022-12-16 [1] RSPM (R 4.2.0)
#>  cachem        1.0.7   2023-02-24 [1] RSPM (R 4.2.0)
#>  cli           3.6.1   2023-03-23 [1] RSPM (R 4.2.0)
#>  desc          1.4.2   2022-09-08 [1] RSPM (R 4.2.0)
#>  digest        0.6.31  2022-12-11 [1] RSPM (R 4.2.0)
#>  dplyr       * 1.1.1   2023-03-22 [1] RSPM (R 4.2.0)
#>  evaluate      0.20    2023-01-17 [1] RSPM (R 4.2.0)
#>  fansi         1.0.4   2023-01-22 [1] RSPM (R 4.2.0)
#>  fastmap       1.1.1   2023-02-24 [1] RSPM (R 4.2.0)
#>  fs            1.6.1   2023-02-06 [1] RSPM (R 4.2.0)
#>  generics      0.1.3   2022-07-05 [1] RSPM (R 4.2.0)
#>  glue          1.6.2   2022-02-24 [1] RSPM (R 4.2.0)
#>  htmltools     0.5.5   2023-03-23 [1] RSPM (R 4.2.0)
#>  jquerylib     0.1.4   2021-04-26 [1] RSPM (R 4.2.0)
#>  jsonlite      1.8.4   2022-12-06 [1] RSPM (R 4.2.0)
#>  knitr         1.42    2023-01-25 [1] RSPM (R 4.2.0)
#>  kwb.utils     0.14.0  2023-04-15 [1] Github (KWB-R/kwb.utils@56518c7)
#>  lifecycle     1.0.3   2022-10-07 [1] RSPM (R 4.2.0)
#>  magrittr      2.0.3   2022-03-30 [1] RSPM (R 4.2.0)
#>  memoise       2.0.1   2021-11-26 [1] RSPM (R 4.2.0)
#>  pillar        1.9.0   2023-03-22 [1] RSPM (R 4.2.0)
#>  pkgconfig     2.0.3   2019-09-22 [1] RSPM (R 4.2.0)
#>  pkgdown       2.0.7   2022-12-14 [1] RSPM (R 4.2.0)
#>  purrr         1.0.1   2023-01-10 [1] RSPM (R 4.2.0)
#>  R6            2.5.1   2021-08-19 [1] RSPM (R 4.2.0)
#>  ragg          1.2.5   2023-01-12 [1] RSPM (R 4.2.0)
#>  rlang         1.1.0   2023-03-14 [1] RSPM (R 4.2.0)
#>  rmarkdown     2.21    2023-03-26 [1] RSPM (R 4.2.0)
#>  rprojroot     2.0.3   2022-04-02 [1] RSPM (R 4.2.0)
#>  sass          0.4.5   2023-01-24 [1] RSPM (R 4.2.0)
#>  sessioninfo   1.2.2   2021-12-06 [1] RSPM (R 4.2.0)
#>  stringi       1.7.12  2023-01-11 [1] RSPM (R 4.2.0)
#>  stringr       1.5.0   2022-12-02 [1] RSPM (R 4.2.0)
#>  systemfonts   1.0.4   2022-02-11 [1] RSPM (R 4.2.0)
#>  textshaping   0.3.6   2021-10-13 [1] RSPM (R 4.2.0)
#>  tibble        3.2.1   2023-03-20 [1] RSPM (R 4.2.0)
#>  tidyr       * 1.3.0   2023-01-24 [1] RSPM (R 4.2.0)
#>  tidyselect    1.2.0   2022-10-10 [1] RSPM (R 4.2.0)
#>  utf8          1.2.3   2023-01-31 [1] RSPM (R 4.2.0)
#>  vctrs         0.6.1   2023-03-22 [1] RSPM (R 4.2.0)
#>  xfun          0.38    2023-03-24 [1] RSPM (R 4.2.0)
#>  yaml          2.3.7   2023-01-23 [1] RSPM (R 4.2.0)
#> 
#>  [1] D:/a/_temp/Library
#>  [2] C:/R/library

Pandoc

#>                                           pandoc_directory pandoc_version
#> 1 C:/HOSTED~1/windows/pandoc/219~1.2/x64/PANDOC~1.2/pandoc         2.19.2