library(worrms)
<- raw_fish_data %>%
cleaned_fish # exclude those without sample number
filter(`Sample number` != "/") %>%
rename(
family = Family,
preparations = presrvation,
lifeStage = C1,
materialEntityID = `Sample number`,
netSize = `Net Size`
%>%
) mutate(
station_number_cleaned = str_remove_all(`Station number`, "\\s*"), # Remove white spaces
eventID = str_c("BROKE_WEST_RMT_", station_number_cleaned, "_RMT", `netSize`),
disposition = case_when(
== "Anton" ~ "pending accession into Institue of Natural Sciences' collection",
Location # not sure where the samples of Evgeny and Megan go
TRUE ~ ""),
verbatimIdentification = case_when(
== "Unidentified" ~ "",
Species == "/" ~ "",
Species TRUE ~ Species),
scientificName = case_when(
grepl(" sp\\.$", verbatimIdentification) ~ sub(" sp\\.$", "", verbatimIdentification),
# correct typo of scientificName based on WoRMS
== "Protomytophum bollini" ~ "Protomyctophum bolini",
verbatimIdentification == "Artedidraco scottsbergi" ~ "Artedidraco skottsbergi",
verbatimIdentification == "Gymnoscophelus braueri" ~ "Gymnoscopelus braueri",
verbatimIdentification # Unidentified == Biota because lifeStage = larvae for these records. Can we be sure that these are fish?
== "Unidentified" ~ "Biota",
Species == "/" ~ "",
Species TRUE ~ verbatimIdentification
)# no more identificationQualifier, use verbatimIdentification :)
)
# match unique scientificName to WoRMS to obtain LSID of AphiaID
<- wm_records_names(unique(cleaned_fish$scientificName)) %>%
matched_taxa # the taxon match returns list of lists, this function creates a table of the following fields out of the lists
# keep aphiaID for surveyTargetID
map_df(~ select(.x, scientificname, lsid, rank, kingdom, AphiaID)) %>%
rename(scientificName = scientificname, scientificNameID = lsid, taxonRank = rank)
# join table to get scientificNameID and taxonRank
<- cleaned_fish %>%
cleaned_fish left_join(matched_taxa, by = "scientificName")
# preview distinct verbatimIdentification and matched scientificName and scientificNameID from WoRMS
%>% distinct(verbatimIdentification, scientificName, scientificNameID, kingdom) cleaned_fish
Transform dataset into DwC-DP
Darwin Core Data Package
The mapping is based on schema from https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/
Clean fish data
Clean the fish data so that the fields are ready for use for further manipulation.
Main tables
Agent
There is only Anton here because this version of dataset does not contain squid records (from the catch).
<- tibble(
agent agentID = "https://orcid.org/0000-0003-1336-5554",
agentType = "person",
preferredAgentName = "Anton Van de Putte"
%>% filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/agent.json") )
Event
How do we determine what eventType
to use? e.g. an Event can be both Survey
and MaterialGathering
.
# Create subevents with net because it is the lowest event with organisms
<- event_google %>%
subevents filter(eventID != "BROKE_WEST") %>% # Exclude the main event
mutate(eventType = "Survey - net type") %>% # Set eventType for subevents
expand_grid(suffix = c("_RMT1", "_RMT8")) %>% # Create subevent suffixes with net size
mutate(
eventID = paste0(eventID, suffix), # Append suffix to eventID
parentEventID = eventID %>% sub("_RMT[18]$", "", .) # Assign correct parent
%>%
) select(-suffix) # Remove the suffix column
# Combine original and subevents
<- bind_rows(event_google, subevents) %>%
event mutate(
sampleSizeValue = case_when(
str_detect(eventID, "_RMT8") ~ sampleSizeValue,
TRUE ~ NA
),sampleSizeUnit = case_when(
str_detect(eventID, "_RMT8") ~ sampleSizeUnit,
TRUE ~ NA)
%>%
) filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/event.json")
%>% filter(str_starts(eventID, "BROKE_WEST_RMT_001")) %>% select(eventID, parentEventID, eventType) event
Material
Material includes:
- preserved individual fish
- stomach content of individual fish
<- cleaned_fish %>%
individual_fish mutate(
materialEntityType = "whole organism",
collectedBy = "Anton Van de Putte",
collectedByID = "https://orcid.org/0000-0003-1336-5554",
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
identificationRemarks = case_when(
str_detect(COMMENTS, "closely resembles") ~ COMMENTS,
str_detect(COMMENTS, "Protomyctophum") ~ COMMENTS,
TRUE ~ ""
),materialEntityRemarks = case_when(
str_detect(COMMENTS, "BROKEN | Check | Microsattelite") ~ COMMENTS,
TRUE ~ ""
)%>%
) select(
materialEntityID, eventID, materialEntityType, preparations, disposition, collectedBy, collectedByID, identifiedBy, identifiedByID, verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank, identificationRemarks, materialEntityRemarks%>% rename(taxonID = scientificNameID) # keep verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank to construct Identification table
)
<- raw_stomach %>%
stomach rename(stomachItem = `Stomach Item`) %>%
mutate(
seqNum = sprintf("%03d", row_number()), # zero-padded sequential number
materialEntityID = str_c(`Fish ID`, "_stomach_", seqNum, "_", stomachItem),
materialEntityType = case_when(
== "St wall" ~ "stomach content - st wall",
stomachItem == "Mucus" ~ "stomach content - mucus",
stomachItem == "Other" ~ "stomach content - other",
stomachItem == "Facet eye" ~ "stomach content - facet eye",
stomachItem == "/" ~ "stomach content - empty",
stomachItem TRUE ~ "stomach content"),
derivedFromMaterialEntityID = `Fish ID`,
derivationType = "stomach content of",
partOfMaterialEntityID = `Fish ID`,
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
verbatimIdentification = case_when(
str_detect(stomachItem, "St wall|Mucus|Other|Facet eye|\\/") ~ "",
TRUE ~ stomachItem
),identificationRemarks = case_when(str_detect(Remarks, "cf.") ~ Remarks, TRUE ~ ""),
materialEntityRemarks = case_when(str_detect(Remarks, "cf.") ~ "", TRUE ~ Remarks),
%>%
) left_join(individual_fish, by = c("derivedFromMaterialEntityID" = "materialEntityID")) %>%
# because left_join renamed the following columns since it is present in both individual_fish and stomach table
rename(materialEntityType = materialEntityType.x,
verbatimIdentification = verbatimIdentification.x,
materialEntityRemarks = materialEntityRemarks.x,
identificationRemarks = identificationRemarks.x,
identifiedBy = identifiedBy.x,
identifiedByID = identifiedByID.x) %>%
select(materialEntityID, eventID, materialEntityType, identifiedBy, identifiedByID, verbatimIdentification, identificationRemarks, derivedFromMaterialEntityID, derivationType, partOfMaterialEntityID, materialEntityRemarks)
# combine both tables into a Material table
<- bind_rows(individual_fish, stomach)
material_join_identification
# preview example of a full fish + its stomach content in Material table
<- material_join_identification %>%
material mutate(materialCategory = "preserved") %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material.json")
%>% filter(str_starts(materialEntityID, "AAV3FF_00337")) material
Identification
Identification terms are mostly part of Material table.
Question: I am expecting OBIS to ask about why taxonID
and not scientificNameID
.
<- "https://archive.org/details/fishesofsouthern00gono | Kellermann, A. (1990): Identification Key and Catalogue of Larval Antarctic Fishes , Berichte zur Polarforschung (Reports on Polar Research), Bremerhaven, Alfred Wegener Institute for Polar and Marine Research, 67 , 136 p. . doi: 10.2312/BzP_0067_1990 | https://archive.org/details/cybium-7-002-003-074"
fish_id_guide
<- material_join_identification %>%
identification mutate(
identificationID = paste(materialEntityID, "ID", sep = "_"),
identificationBasedOnMaterialEntityID = materialEntityID,
identificationType = "MaterialEntity",
identificationTypeIRI = "http://rs.tdwg.org/dwc/terms/MaterialEntity",
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
identificationReferences = fish_id_guide) %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/identification.json")
head(identification)
Media
<- media_google %>%
media_material # also needed to construct Material Media table later
left_join(material, by = "materialEntityID") %>%
# remove these media records because there is no data associated with these pictures -> we don't know what these are
filter(!is.na(eventID)) %>%
mutate(mediaDescription = str_c("Photo of ", scientificName, " with its label and a grid."))
<- filter_dataframe_by_json(media_material, "https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/media.json")
media head(media)
Survey
Survey table is a table that contains current Humboldt Extension terms.
<- c("_RMT1","_RMT8")
suffixes
<- humboldt %>%
survey_w_protocol filter(eventID != "BROKE_WEST") %>% # remove the expedition event, only link humboldt to the lowest level of Survey Events
expand_grid(suffix = suffixes) %>%
mutate(eventID = paste0(eventID, suffix),
# need to cast netSize to string so that it can be joined later (so that it is not a double)
netSize = case_when(suffix == "_RMT1" ~ "1", suffix == "_RMT8" ~ "8", TRUE ~ NA),
surveyID = eventID,
isLeastSpecificTargetCategoryQuantityInclusive = "true")
<- survey_w_protocol %>%
survey filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/survey.json")
head(survey)
Protocol
<- filter_dataframe_by_json(protocol, "https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/protocol.json")
protocol protocol
Survey Target & Occurrence
After extensive discussions on what target
is, we decided to create survey-target table retrospectively. SurveyTarget needs to be created per net size because it is linked to a Survey (lowest level of Survey is per net size) via SurveyID. SurveyTargetAbundance points to the Survey through the SurveyTarget record.
Challenge: There is only total count for fish from RMT8 and not RMT1. The total count also does not contain any sex
and lifeStage
for all the taxa except Electrona antarctica.
<- transform_rmt_count_to_long(raw_count_routine)
routine_trawl <- transform_rmt_count_to_long(raw_count_target)
target_trawl
<- rbind(routine_trawl$survey_target, target_trawl$survey_target) %>%
survey_target mutate(includeOrExclude = "include", isSurveyTargetFullyReported = "true") %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/survey-target.json")
<- rbind(routine_trawl$occurrence, target_trawl$occurrence) %>%
occurrence mutate(occurrenceID = str_c(eventID, surveyTargetID, sep = "_")) %>%
# filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/occurrence.json") # comment this out because surveyTargetID is not in the schema
select(occurrenceID, verbatimIdentification, organismQuantity, organismQuantityType, lifeStage, eventID, taxonID, taxonRank, kingdom, surveyTargetID, scientificName, occurrenceStatus, recordedBy, recordedByID, identifiedBy, identifiedByID)
# preview example tables
head(survey_target)
head(occurrence)
Common tables
Assertion
Event Assertion
Placing environmental measurements under Event Assertion instead of Survey Assertion because it is for both RMT1 and RMT8. Putting this at the level of station Event and not net Event.
Expedition
|__ Sampling station
|__ RMT1
|__ RMT8
<- emof %>%
assertion rename(assertionID = measurementID, assertionType = measurementType, assertionTypeIRI = measurementTypeID, assertionValueIRI = measurementValueID, assertionUnit = measurementUnit, assertionUnitIRI = measurementUnitID) %>%
mutate(assertionValue = if_else(grepl("[a-zA-Z]", measurementValue), measurementValue, NA),
assertionValueNumeric = if_else(!grepl("[a-zA-Z]", measurementValue), as.numeric(measurementValue), NA))
<- assertion %>%
event_assertion filter(is.na(occurrenceID)) %>%
mutate(
eventID = case_when(
# because sampled volume is only available for RMT8
== "Sampled volume" ~ str_c(eventID, "_RMT8"),
assertionType TRUE ~ eventID
),assertionID = case_when(
== "Sampled volume" ~ str_c(eventID, "_RMT8_", assertionType),
assertionType TRUE ~ assertionID
),assertionTypeVocabulary = "http://vocab.nerc.ac.uk/collection/P01/current/",
assertionProtocol = case_when(
%in% c("Average temperature", "Maximum Pressure", "Average Salinity", "Average Conductivity", "Average fluoresence") ~ "CTD",
assertionType %in% c("Solar Azimuth", "Solar elevation", "daylength") ~ "solar azimuth, elevation and day length",
assertionType TRUE ~ ""),
assertionProtocolID = case_when(
== "CTD" ~ "ctd",
assertionProtocol == "solar azimuth, elevation and day length" ~ "light_conditions",
assertionProtocol TRUE ~ ""),
assertionUnitVocabulary = "http://vocab.nerc.ac.uk/collection/P06/current/"
%>%
) filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/event-assertion.json")
%>% filter(assertionID == "BROKE_WEST_RMT_028_temp") event_assertion
Material Assertion
<- assertion %>%
fish_length_assertion filter(!is.na(occurrenceID)) %>%
rename(materialEntityID = occurrenceID) %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material-assertion.json")
<- cleaned_fish %>%
fish_lifestage_assertion filter(!is.na(lifeStage)) %>%
select(materialEntityID, lifeStage) %>%
rename(assertionValue = lifeStage) %>%
mutate(assertionID = str_c(materialEntityID, "_lifeStage"),
assertionType = "life stage",
assertionTypeIRI = "http://rs.tdwg.org/dwc/terms/lifeStage",
assertionValueIRI = case_when(
== "Larvae" ~ "https://vocab.nerc.ac.uk/collection/S11/current/S1128/",
assertionValue TRUE ~ NA
),assertionUnit = NA,
assertionUnitIRI = NA,
assertionValueNumeric = NA)
<- rbind(fish_length_assertion, fish_lifestage_assertion) %>%
material_assertion filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material-assertion.json")
%>% filter(materialEntityID == "AAV3FF_00337") material_assertion
Media
Material Media
<- media_material %>%
material_media mutate(mediaSubjectCategory = "specimen photo") %>%
filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material-media.json")
head(material_media)
Protocol
Survey Protocol
As Event is also a Survey, I use Survey Protocol table instead of Event Protocol table.
<- survey_w_protocol %>%
survey_protocol select(surveyID, samplingProtocol) %>%
rename(protocolID = samplingProtocol) %>%
filter(!is.na(protocolID))
%>% filter(surveyID == "BROKE_WEST_RMT_001_RMT8") survey_protocol
Identifier
Agent Identifier
<- tibble(
agent_identifier identifier = "https://orcid.org/0000-0003-1336-5554",
agentID = "https://orcid.org/0000-0003-1336-5554",
identifierType = "ORCID",
identifierLanguage = "eng"
%>% filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/agent-identifier.json")
) agent_identifier