1 Install R packages
if (! require("remotes")) {
install.packages("remotes", repos = "https://cloud.r-project.org")
}
remotes::install_github("kwb-r/kwb.geosalz", dependencies = TRUE)
2 Setup the project
2.1 Define paths
# Define paths and resolve placeholders
paths <- list(
servername = Sys.getenv("servername"),
root_server = "//<servername>/processing",
project = "geosalz",
task = "labor",
processing = "<root>/<project>/<task>",
input_dir = "<processing>/precleaned-data/v0.2",
input_dir_meta = "<input_dir>/META",
export_dir = "<processing>/precleaned-data/v0.3",
export_dir_meta = "<export_dir>/META",
cleaned_data_dir = "<processing>/cleaned-data",
figures_dir = "<processing>/figures",
foerdermengen = "<export_dir>/2018-04-27 Rohwasser Bericht - Galeriefördermengen.xlsx",
parameters = "<export_dir_meta>/2018-06-01 Lab Parameter.xlsx",
lookup_para = "<export_dir_meta>/lookup_para.csv",
sites = "<export_dir_meta>/Info-Altdaten.xlsx"
)
paths <- kwb.utils::resolve(paths, root = "root_server")
#paths <- kwb.utils::resolve(paths, root = "C:/projects")
2.2 Check input directory
Check if input directory selected above exists:
# Check if input directory exists
input_dir <- kwb.utils::safePath(kwb.utils::selectElements(paths, "input_dir"))
2.3 Check export directory
Check and in case the export directory is not available create it:
# Check if exists and if not create it
export_dir <- paths %>%
kwb.utils::selectElements("export_dir") %>%
kwb.utils::createDirectory() %>%
kwb.utils::safePath()
## Convert xls to xlsx files
kwb.geosalz::convert_xls_as_xlsx(input_dir, export_dir)
## Copy xlsx files
kwb.geosalz::copy_xlsx_files(input_dir, export_dir, overwrite = TRUE)
3 Data import
3.1 Categorise the Excel Files
Select files if they consist of:
- Header 1 (meta): with manually added metadata in an extra sheet
- Header 1: single header tables
- Header 2: two-row header tables or
- Header 4: four row header tables
# Get all xlsx files to be imported
files <- dir(export_dir, ".xlsx", recursive = TRUE, full.names = TRUE)
files_meta <- c(
"Meta Info",
"Header ident",
"Parameter ident",
"Parameter",
"Info-Altdaten",
"Brandenburg_Parameter_BWB_Stolpe",
"Kopie von Brandenburg_Parameter_BWB_Stolpe",
"2005-10BeschilderungProbenahmestellenGWWIII",
"Bezeichnungen der Reinwasserstellen",
"ReinwasserNomenklatur",
"Info zu Altdaten 1970-1998",
"2018-06-01 Lab Parameter"
)
files_header1_meta <- c(
"FRI_Br_GAL_C_Einzelparameter",
"FRI_Roh_Rein_NH4+NO3_2001-2003",
"MTBE_2003-11_2004",
"Reinwasser_2003_Fe_Mn", ## unclean
"VC_CN_in Brunnen bis Aug_2005 ", ## unclean
"Wuhlheide_Beelitzhof_Teildaten" ## unclean
)
files_header1 <- c(
"2018-04-11 Chlorid in Brunnen - Übersicht",
"2018-04-27 LIMS Reiw & Rohw Sammel ",
"2018-04-27 Rohwasser Bericht - Galeriefördermengen"
)
files_header4 <- c(
"STO Rohw_1999-6_2004",
"Wuhlheide_1999-2003_Okt - Neu",
"KAU_1999-Okt2003"
)
files_archive <- "Siebert"
files_to_ignore <- c(
files_meta, files_header1, files_header1_meta,
files_header4, files_archive
)
in_files_to_ignore <- kwb.utils::removeExtension(basename(files)) %in%
files_to_ignore
filepaths_header2 <- files[!in_files_to_ignore]
3.2 Header 1 (with metadata)
cond1 <- kwb.utils::removeExtension(basename(files)) %in% files_header1_meta
filepaths_header1_meta <- files[cond1]
labor_list_1meta <- kwb.geosalz::import_labor(
filepaths_header1_meta ,
export_dir,
func = kwb.geosalz::read_bwb_header1_meta
)
has_errors <- sapply(labor_list_1meta, kwb.utils::isTryError)
#has_errors
labor_df_1meta <- data.table::rbindlist(
l = labor_list_1meta[!has_errors],
fill = TRUE
)
3.3 Header 2
labor_header2_list <- kwb.geosalz::import_labor(
files = filepaths_header2,
export_dir = export_dir,
func = kwb.geosalz::read_bwb_header2
)
has_errors <- sapply(labor_header2_list, kwb.utils::isTryError)
#has_errors
labor_header2_df <- data.table::rbindlist(
l = labor_header2_list[!has_errors],
fill = TRUE
)
3.4 Header 4
cond3 <- kwb.utils::removeExtension(basename(files)) %in% files_header4
filepaths_header4 <- files[cond3]
labor_header4_list <- kwb.geosalz::import_labor(
files = filepaths_header4,
export_dir = export_dir,
func = kwb.geosalz::read_bwb_header4
)
has_errors <- sapply(labor_header4_list, inherits, "try-error")
has_errors
labor_header4_df <- data.table::rbindlist(
l = labor_header4_list[!has_errors],
fill = TRUE
)
4 Data cleaning
4.2 Filtering
labor_all <- labor_all %>%
dplyr::filter(!is.na(.data$DataValue)) %>%
dplyr::mutate(
Date = dplyr::if_else(
condition = !is.na(.data$Datum),
true = .data$Datum,
false = .data$Probenahme
)
) %>%
### Some "Datum" rentries are missing in;
### K-TL_LSW-Altdaten-Werke Teil 1\Werke Teil 1\Kaulsdorf\KAU_1999-Okt2003.xlsx
### sheets: 66 KAU Rein 1999-2000, 65 KAU NordSüd 1999-2000
dplyr::filter(!is.na(.data$Date)) %>%
dplyr::filter(!is.na(.data$VariableName_org))
nrow(labor_all)
4.3 Reducing dataset by adding metadata
labordaten_ww <- kwb.geosalz::add_para_metadata(
df = labor_all,
lookup_para_path = paths$lookup_para,
parameters_path = paths$parameters
)
labordaten_ww <- kwb.geosalz::add_site_metadata(
df = labordaten_ww,
site_path = paths$sites
) %>%
dplyr::mutate(
year = as.numeric(format(.data$Date,format = '%Y')),
DataValue = as.numeric(.data$DataValue)
)
nrow(labordaten_ww)
5. Exporting cleaned dataset
fs::dir_create(paths$cleaned_data_dir, recursive = TRUE)
print(sprintf("Export cleaned data to: %s", paths$cleaned_data_dir))
foerdermengen_ww <- kwb.geosalz::get_foerdermengen(paths$foerdermengen)
save(
labordaten_ww,
foerdermengen_ww,
file = file.path(paths$cleaned_data_dir, "cleaned-data.Rds")
)
write.csv2(
labordaten_ww,
file.path(paths$cleaned_data_dir, "labordaten_ww.csv"),
row.names = FALSE
)
write.csv2(
foerdermengen_ww,
file = file.path(paths$cleaned_data_dir, "foerdermengen_ww.csv"),
row.names = FALSE
)
6. Visualisation
library(ggplot2)
fs::dir_create(paths$figures_dir, recursive = TRUE)
print(sprintf("Export figures/plots to: %s", paths$figures_dir))
para_info <- kwb.geosalz::get_parameters_meta(paths$parameters)
water_types <- c("Reinwasser", "Rohwasser")
plot_to_pdf <- function(pdf_file, labordaten_ww, para_info, water_type, paths)
{
pdf(file = pdf_file, width = 14, height = 9)
on.exit(dev.off())
for (sel_para_id in unique(labordaten_ww$para_id)) {
my_selection <- sprintf(
"%s (%s)",
para_info$para_kurzname[para_info$para_id == sel_para_id],
water_type
)
tmp <- labordaten_ww %>%
dplyr::filter(prufgegenstand == water_type) %>%
dplyr::filter(para_id == sel_para_id) %>%
dplyr::group_by(.data$para_kurzname, .data$werk, .data$year) %>%
dplyr::summarise(
mean_DataValue = mean(as.numeric(.data$DataValue), na.rm = TRUE)
) %>%
dplyr::filter(!is.na(.data$werk)) %>%
dplyr::left_join(
y = kwb.geosalz::get_foerdermengen(paths$foerdermengen),
by = c("werk", "year")
)
if (nrow(tmp) > 0) {
cat(sprintf("for %s\n", my_selection))
g <- ggplot2::ggplot(tmp, mapping = ggplot2::aes_string(
x = "year",
y = "mean_DataValue",
col = "werk"
)) +
ggplot2::geom_point() +
ggplot2::geom_line() +
ggplot2::theme_bw() +
ggplot2::ggtitle(label = my_selection) +
ggplot2::labs(x = "", y = "Jahresmittelwert")
print(g)
} else {
cat(sprintf("not data availabe for %s\n", my_selection))
}
}
}
for (water_type in water_types) {
pdf_file <- file.path(
paths$figures_dir,
sprintf(
"Zeitreihen_Jahresmittelwerte_Werke_%s.pdf",
water_type
)
)
cat(sprintf("Creating plot:\n%s\n", pdf_file))
plot_to_pdf(pdf_file, labordaten_ww, para_info, water_type, paths)
}
7 Session Info
Plattform
name | value |
---|---|
version | R version 4.4.2 (2024-10-31 ucrt) |
os | Windows Server 2022 x64 (build 20348) |
system | x86_64, mingw32 |
ui | RTerm |
language | en |
collate | English_United States.utf8 |
ctype | English_United States.utf8 |
tz | UTC |
date | 2024-12-23 |
pandoc | 3.1.11 @ C:/HOSTED1/windows/pandoc/31F3871.11/x64/PANDOC~1.11/ (via rmarkdown) |
Packages
#> package * version date (UTC) lib source
#> bslib 0.8.0 2024-07-29 [1] CRAN (R 4.4.1)
#> cachem 1.1.0 2024-05-16 [1] CRAN (R 4.4.0)
#> cli 3.6.3 2024-06-21 [1] RSPM (R 4.4.0)
#> desc 1.4.3 2023-12-10 [1] CRAN (R 4.4.0)
#> digest 0.6.37 2024-08-19 [1] RSPM (R 4.4.0)
#> dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.4.0)
#> evaluate 1.0.1 2024-10-10 [1] CRAN (R 4.4.1)
#> fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
#> fs 1.6.5 2024-10-30 [1] CRAN (R 4.4.1)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.4.0)
#> glue 1.8.0 2024-09-30 [1] CRAN (R 4.4.1)
#> htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
#> htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.4.0)
#> jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.4.0)
#> jsonlite 1.8.9 2024-09-20 [1] RSPM (R 4.4.0)
#> knitr 1.49 2024-11-08 [1] RSPM (R 4.4.0)
#> kwb.utils 0.15.0 2024-04-25 [1] Github (kwb-r/kwb.utils@4415aa2)
#> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.4.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.4.0)
#> pillar 1.10.0 2024-12-17 [1] CRAN (R 4.4.2)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.4.0)
#> pkgdown 2.1.1 2024-09-17 [1] RSPM (R 4.4.0)
#> purrr 1.0.2 2023-08-10 [1] CRAN (R 4.4.0)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.4.0)
#> ragg 1.3.0 2024-03-13 [1] CRAN (R 4.4.0)
#> rlang 1.1.4 2024-06-04 [1] CRAN (R 4.4.0)
#> rmarkdown 2.29 2024-11-04 [1] CRAN (R 4.4.2)
#> sass 0.4.9 2024-03-15 [1] CRAN (R 4.4.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.4.0)
#> systemfonts 1.1.0 2024-05-15 [1] CRAN (R 4.4.0)
#> textshaping 0.3.7 2023-10-09 [1] CRAN (R 4.4.0)
#> tibble 3.2.1 2023-03-20 [1] CRAN (R 4.4.0)
#> tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.4.0)
#> tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.4.0)
#> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.4.0)
#> xfun 0.49 2024-10-31 [1] CRAN (R 4.4.1)
#> yaml 2.3.10 2024-07-26 [1] RSPM (R 4.4.0)
#>
#> [1] D:/a/_temp/Library
#> [2] C:/R/library