library(worrms)
cleaned_fish <- raw_fish_data %>%
# exclude those without sample number
filter(`Sample number` != "/") %>%
rename(
family = Family,
preparations = presrvation,
lifeStage = C1,
materialEntityID = `Sample number`,
netSize = `Net Size`
) %>%
mutate(
station_number_cleaned = str_remove_all(`Station number`, "\\s*"), # Remove white spaces
eventID = str_c("BROKE_WEST_RMT_", station_number_cleaned, "_RMT", `netSize`),
disposition = case_when(
Location == "Anton" ~ "pending accession into Institue of Natural Sciences' collection",
# not sure where the samples of Evgeny and Megan go
TRUE ~ ""),
verbatimIdentification = case_when(
Species == "Unidentified" ~ "",
Species == "/" ~ "",
TRUE ~ Species),
scientificName = case_when(
grepl(" sp\\.$", verbatimIdentification) ~ sub(" sp\\.$", "", verbatimIdentification),
# correct typo of scientificName based on WoRMS
verbatimIdentification == "Protomytophum bollini" ~ "Protomyctophum bolini",
verbatimIdentification == "Artedidraco scottsbergi" ~ "Artedidraco skottsbergi",
verbatimIdentification == "Gymnoscophelus braueri" ~ "Gymnoscopelus braueri",
# Unidentified == Biota because lifeStage = larvae for these records. Can we be sure that these are fish?
Species == "Unidentified" ~ "Biota",
Species == "/" ~ "",
TRUE ~ verbatimIdentification
)
# no more identificationQualifier, use verbatimIdentification :)
)
# match unique scientificName to WoRMS to obtain LSID of AphiaID
matched_taxa <- wm_records_names(unique(cleaned_fish$scientificName)) %>%
# the taxon match returns list of lists, this function creates a table of the following fields out of the lists
# keep aphiaID for surveyTargetID
map_df(~ select(.x, scientificname, lsid, rank, kingdom, AphiaID)) %>%
rename(scientificName = scientificname, scientificNameID = lsid, taxonRank = rank)
# join table to get scientificNameID and taxonRank
cleaned_fish <- cleaned_fish %>%
left_join(matched_taxa, by = "scientificName")
# preview distinct verbatimIdentification and matched scientificName and scientificNameID from WoRMS
cleaned_fish %>% distinct(verbatimIdentification, scientificName, scientificNameID, kingdom)Transform dataset into DwC-DP
Darwin Core Data Package
The mapping is based on schema from https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/
Clean fish data
Clean the fish data so that the fields are ready for use for further manipulation.
Main tables
Agent
agent <- agent %>% filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/agent.json")Agent agent role
agent_agent_role <- agent_agent_role %>% filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/agent-agent-role.json")Bibliographic Resource
bibliographic_resource <- bibliographic_resource %>% filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/bibliographic-resource.json")Protocol Reference
Needs to come first before Protocol because otherwise filter_dataframe_by_json will remove referenceID from Protocol table.
protocol_reference <- protocol %>%
select(protocolID, referenceID) %>%
filter(!is.na(referenceID)) %>% # remove rows without referenceID
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/protocol-reference.json")Protocol
protocol <- filter_dataframe_by_json(protocol, "https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/protocol.json")
protocolProvenance
provenance <- filter_dataframe_by_json(provenance, "https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/provenance.json")Usage Policy
usage_policy <- tibble(
usagePolicyID = "CC0",
rights = "Public Domain",
rightsIRI = "https://creativecommons.org/public-domain/cc0/",
license = "https://creativecommons.org/public-domain/cc0/",
credit = "Photo taken by Anton Van de Putte (CC0)",
) %>% filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/usage-policy.json")Event
How do we determine what eventType to use? e.g. an Event can be both Survey and MaterialGathering.
# Create subevents with net because it is the lowest event with organisms
subevents <- event_google %>%
filter(eventID != "BROKE_WEST") %>% # Exclude the main event
mutate(
eventType = "Survey - device subunit") %>% # Set eventType for subevents
expand_grid(suffix = c("_RMT1", "_RMT8")) %>% # Create subevent suffixes with net opening
mutate(
eventID = paste0(eventID, suffix), # Append suffix to eventID
parentEventID = eventID %>% sub("_RMT[18]$", "", .) # Assign correct parent
) %>%
select(-suffix) # Remove the suffix column
# Combine original and subevents, the sampleSizeValue and sampleSizeUnit are only available for RMT8
event <- bind_rows(event_google, subevents) %>%
mutate(
sampleSizeValue = case_when(
str_detect(eventID, "_RMT8") ~ sampleSizeValue,
TRUE ~ NA
),
sampleSizeUnit = case_when(
str_detect(eventID, "_RMT8") ~ sampleSizeUnit,
TRUE ~ NA)
)
event %>% filter(str_starts(eventID, "BROKE_WEST_RMT_001")) %>% select(eventID, parentEventID, eventType)# will only filter for event fields based on DwC-DP schema later since we still need to join Survey table as sampleSizeValue and sampleSizeUnit are now under Survey class.Survey
Survey table is a table that contains current Humboldt Extension terms.
suffixes <- c("_RMT1","_RMT8")
survey_w_protocol <- humboldt %>%
filter(eventID != "BROKE_WEST") %>% # remove the expedition event, only link humboldt to the lowest level of Survey Events
expand_grid(suffix = suffixes) %>%
mutate(eventID = paste0(eventID, suffix),
# need to cast netSize to string so that it can be joined later (so that it is not a double)
netSize = case_when(suffix == "_RMT1" ~ "1", suffix == "_RMT8" ~ "8", TRUE ~ NA),
surveyID = eventID,
isLeastSpecificTargetCategoryQuantityInclusive = "true")
# join survey_w_protocol with event table to get sampleSizeValue and sampleSizeUnit
survey_w_protocol_sample_size <- event %>%
select(eventID, sampleSizeValue, sampleSizeUnit) %>%
right_join(survey_w_protocol, by = "eventID")
survey <- survey_w_protocol_sample_size %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/survey.json")
head(survey)Survey Target
After extensive discussions on what target is, we decided to create survey-target table retrospectively because the objective of the survey is to quantify the biodiversity at the sites surveyed. SurveyTarget needs to be created per net size because it is linked to a Survey (lowest level of Survey is per net size) via SurveyID.
Limitation: There is only total count for fish from RMT8 and not RMT1. The total count also does not contain any
sexandlifeStagefor all the taxa except Electrona antarctica.
routine_trawl <- transform_rmt_count_to_long(raw_count_routine)
target_trawl <- transform_rmt_count_to_long(raw_count_target)
survey_target <- rbind(routine_trawl$survey_target, target_trawl$survey_target) %>%
mutate(includeOrExclude = "include", isSurveyTargetFullyReported = "true") %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/survey-target.json")
# preview example tables
head(survey_target)Occurrence
occurrence <- rbind(routine_trawl$occurrence, target_trawl$occurrence) %>%
mutate(occurrenceID = surveyTargetID,
organismQuantityType = case_when(organismQuantity <= 1 ~ "individual",
organismQuantity > 1 ~ "individuals")) %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/occurrence.json")
head(occurrence)Material
Material includes:
- preserved individual fish
- stomach content of individual fish
individual_fish <- cleaned_fish %>%
mutate(
materialEntityType = "whole organism",
materialCategory = "preserved",
collectedBy = "Anton Van de Putte",
collectedByID = "https://orcid.org/0000-0003-1336-5554",
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
identificationRemarks = case_when(
str_detect(COMMENTS, "closely resembles") ~ COMMENTS,
str_detect(COMMENTS, "Protomyctophum") ~ COMMENTS,
TRUE ~ ""
),
materialEntityRemarks = case_when(
str_detect(COMMENTS, "BROKEN | Check | Microsattelite") ~ COMMENTS,
TRUE ~ ""
)
) %>%
select(
materialEntityID, eventID, materialCategory, materialEntityType, preparations, disposition, collectedBy, collectedByID, identifiedBy, identifiedByID, verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank, identificationRemarks, materialEntityRemarks, AphiaID, lifeStage
) %>%
rename(taxonID = scientificNameID) # keep verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank to construct Identification table
stomach <- raw_stomach %>%
rename(stomachItem = `Stomach Item`) %>%
mutate(
seqNum = sprintf("%03d", row_number()), # zero-padded sequential number
materialEntityID = str_c(`Fish ID`, "_stomach_", seqNum, "_", stomachItem),
materialEntityType = case_when(
stomachItem == "St wall" ~ "stomachContent - st wall",
stomachItem == "Mucus" ~ "stomachContent - mucus",
stomachItem == "Other" ~ "stomachContent - other",
stomachItem == "Facet eye" ~ "stomachContent - facet eye",
stomachItem == "/" ~ "stomachContent - empty",
TRUE ~ "stomachContent"),
derivedFromMaterialEntityID = `Fish ID`,
materialCategory = "biological",
derivationType = "stomach content of",
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
verbatimIdentification = case_when(
str_detect(stomachItem, "St wall|Mucus|Other|Facet eye|\\/") ~ "",
TRUE ~ stomachItem
),
identificationRemarks = case_when(str_detect(Remarks, "cf.") ~ Remarks, TRUE ~ ""),
materialEntityRemarks = case_when(str_detect(Remarks, "cf.") ~ "", TRUE ~ Remarks),
) %>%
left_join(individual_fish, by = c("derivedFromMaterialEntityID" = "materialEntityID")) %>%
# because left_join renamed the following columns since it is present in both individual_fish and stomach table
rename(materialEntityType = materialEntityType.x,
verbatimIdentification = verbatimIdentification.x,
materialEntityRemarks = materialEntityRemarks.x,
identificationRemarks = identificationRemarks.x,
identifiedBy = identifiedBy.x,
identifiedByID = identifiedByID.x)
# combine both tables into a Material table
# material_join_identification is needed for Identification table later
material_join_identification <- bind_rows(individual_fish, stomach) %>%
mutate(evidenceForOccurrenceID = case_when(
lifeStage != "" ~ str_c(eventID, AphiaID, lifeStage, sep = "_"),
TRUE ~ str_c(eventID, AphiaID, sep = "_"))) %>%
mutate(
evidenceForOccurrenceID = if_else(
evidenceForOccurrenceID %in% occurrence$occurrenceID,
evidenceForOccurrenceID,
str_c(eventID, AphiaID, sep = "_") # rebuild without lifeStage
),
evidenceForOccurrenceID = if_else(
str_detect(eventID, "RMT1"), # because there is no occurrence record for RMT1
"",
evidenceForOccurrenceID
)
)
material <- material_join_identification %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material.json")
# preview record
material %>% filter(str_starts(materialEntityID, "AAV3FF_00337"))Identification
fish_id_guide <- "https://archive.org/details/fishesofsouthern00gono | Kellermann, A. (1990): Identification Key and Catalogue of Larval Antarctic Fishes , Berichte zur Polarforschung (Reports on Polar Research), Bremerhaven, Alfred Wegener Institute for Polar and Marine Research, 67 , 136 p. . doi: 10.2312/BzP_0067_1990 | https://archive.org/details/cybium-7-002-003-074"
identification <- material_join_identification %>%
mutate(
identificationID = paste(materialEntityID, "ID", sep = "_"),
basedOnMaterialEntityID = materialEntityID,
identificationType = "MaterialEntity",
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
identificationReferences = fish_id_guide,
higherClassificationName = if_else(!is.na(kingdom) & kingdom != "", kingdom, NA_character_),
higherClassificationRank = if_else(!is.na(kingdom) & kingdom != "", "kingdom", NA_character_)) %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/identification.json")
head(identification)Media
media_material <- media_google %>%
# also needed to construct Material Media table later
left_join(material, by = "materialEntityID") %>%
# remove these media records because there is no data associated with these pictures -> we don't know what these are
filter(!is.na(eventID)) %>%
mutate(
description = str_c("Photo of ", scientificName, " with its label and a grid."))
media <- filter_dataframe_by_json(media_material, "https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/media.json")
head(media)Joined tables
Assertion
Event Assertion
Placing environmental measurements under Event Assertion instead of Survey Assertion because it is for both RMT1 and RMT8. Putting this at the level of station Event and not net Event.
Expedition
|__ Sampling station
|__ RMT1
|__ RMT8
assertion <- emof %>%
rename(assertionID = measurementID, assertionType = measurementType, assertionTypeIRI = measurementTypeID, assertionValueIRI = measurementValueID, assertionUnit = measurementUnit, assertionUnitIRI = measurementUnitID) %>%
mutate(assertionValue = if_else(grepl("[a-zA-Z]", measurementValue), measurementValue, NA),
assertionValueNumeric = if_else(!grepl("[a-zA-Z]", measurementValue), as.numeric(measurementValue), NA))
event_assertion <- assertion %>%
filter(is.na(occurrenceID)) %>%
mutate(
eventID = case_when(
# because sampled volume is only available for RMT8
assertionType == "Sampled volume" ~ str_c(eventID, "_RMT8"),
TRUE ~ eventID
),
assertionID = case_when(
assertionType == "Sampled volume" ~ str_c(eventID, "_RMT8_", assertionType),
TRUE ~ assertionID
),
assertionTypeVocabulary = "http://vocab.nerc.ac.uk/collection/P01/current/",
assertionProtocols = case_when(
assertionType %in% c("Average temperature", "Maximum Pressure", "Average Salinity", "Average Conductivity", "Average fluoresence") ~ "CTD",
assertionType %in% c("Solar Azimuth", "Solar elevation", "daylength") ~ "solar azimuth, elevation and day length",
TRUE ~ ""),
assertionProtocolID = case_when(
assertionProtocols == "CTD" ~ "ctd",
assertionProtocols == "solar azimuth, elevation and day length" ~ "light_conditions",
TRUE ~ ""),
assertionUnitVocabulary = "http://vocab.nerc.ac.uk/collection/P06/current/"
) %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/event-assertion.json")
event_assertion %>% filter(assertionID == "BROKE_WEST_RMT_028_temp")Material Assertion
fish_length_assertion <- assertion %>%
filter(!is.na(occurrenceID)) %>%
rename(materialEntityID = occurrenceID) %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material-assertion.json")
fish_lifestage_assertion <- cleaned_fish %>%
filter(!is.na(lifeStage)) %>%
select(materialEntityID, lifeStage) %>%
rename(assertionValue = lifeStage) %>%
mutate(assertionID = str_c(materialEntityID, "_lifeStage"),
assertionType = "life stage",
assertionTypeIRI = "http://rs.tdwg.org/dwc/terms/lifeStage",
assertionValueIRI = case_when(
assertionValue == "Larvae" ~ "https://vocab.nerc.ac.uk/collection/S11/current/S1128/",
TRUE ~ NA
),
assertionUnit = NA,
assertionUnitIRI = NA,
assertionValueNumeric = NA)
stomach_assertion <- stomach %>%
select(materialEntityID, DoD, No) %>%
pivot_longer(
cols = c(DoD, No),
names_to = "assertionType",
values_to = "assertionValueNumeric",
values_drop_na = TRUE
) %>%
mutate(
assertionType = recode(assertionType,
DoD = "degree of digestion",
No = "individual count"),
assertionID = case_when(
assertionType == "degree of digestion" ~ str_c(materialEntityID, "_digestion"),
assertionType == "individual count" ~ str_c(materialEntityID, "_count")),
assertionTypeIRI = "",
assertionValueIRI = "",
assertionUnit = "",
assertionUnitIRI = "",
assertionValue = ""
)
material_assertion <- rbind(fish_length_assertion, fish_lifestage_assertion, stomach_assertion) %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material-assertion.json")
material_assertion %>% filter(materialEntityID == "AAV3FF_00337")Provenance
Event Provenance
event_provenance <- crossing(
provenance %>% select(provenanceID),
event %>% select(eventID)
)
head(event_provenance)Material Provenance
material_provenance <- crossing(
provenance %>% select(provenanceID),
material %>% select(materialEntityID)
)
head(material_provenance)Media Provenance
media_provenance <- crossing(
provenance %>% select(provenanceID),
media %>% select(mediaID)
)
head(media_provenance)Usage Policy
Media Usage Policy
media_usage_policy <- crossing(
usage_policy %>% select(usagePolicyID),
media %>% select(mediaID)
)
head(media_usage_policy)Media
Material Media
material_media <- media_material %>%
mutate(mediaSubjectCategory = "specimen photo") %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material-media.json")
head(material_media)Protocol
Survey Protocol
As Event is also a Survey, I use Survey Protocol table instead of Event Protocol table.
survey_protocol <- survey_w_protocol %>%
select(surveyID, samplingProtocol) %>%
rename(protocolID = samplingProtocol) %>%
filter(!is.na(protocolID))
survey_protocol %>% filter(surveyID == "BROKE_WEST_RMT_001_RMT8")Identifier
Agent Identifier
agent_identifier <- agent_identifier %>% filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/agent-identifier.json")
agent_identifier