library(worrms)
<- raw_fish_data %>%
cleaned_fish # exclude those without sample number
filter(`Sample number` != "/") %>%
rename(
family = Family,
preparations = presrvation,
disposition = Location,
lifeStage = C1,
materialEntityID = `Sample number`,
netSize = `Net Size`
%>%
) mutate(
station_number_cleaned = str_remove_all(`Station number`, "\\s*"), # Remove white spaces
eventID = str_c("BROKE_WEST_RMT_", station_number_cleaned, "_RMT", `netSize`),
verbatimIdentification = case_when(
== "Unidentified" ~ "",
Species == "/" ~ "",
Species TRUE ~ Species),
scientificName = case_when(
grepl(" sp\\.$", verbatimIdentification) ~ sub(" sp\\.$", "", verbatimIdentification),
# correct typo of scientificName based on WoRMS
== "Protomytophum bollini" ~ "Protomyctophum bolini",
verbatimIdentification == "Artedidraco scottsbergi" ~ "Artedidraco skottsbergi",
verbatimIdentification == "Gymnoscophelus braueri" ~ "Gymnoscopelus braueri",
verbatimIdentification # Unidentified == Biota because lifeStage = larvae for these records. Can we be sure that these are fish?
== "Unidentified" ~ "Biota",
Species == "/" ~ "",
Species TRUE ~ verbatimIdentification
)# no more identificationQualifier, use verbatimIdentification :)
)
# match unique scientificName to WoRMS to obtain LSID of AphiaID
<- wm_records_names(unique(cleaned_fish$scientificName)) %>%
matched_taxa # the taxon match returns list of lists, this function creates a table of the following fields out of the lists
# keep aphiaID for surveyTargetID
map_df(~ select(.x, scientificname, lsid, rank, kingdom, AphiaID)) %>%
rename(scientificName = scientificname, scientificNameID = lsid, taxonRank = rank)
# join table to get scientificNameID and taxonRank
<- cleaned_fish %>%
cleaned_fish left_join(matched_taxa, by = "scientificName")
# preview distinct verbatimIdentification and matched scientificName and scientificNameID from WoRMS
%>% distinct(verbatimIdentification, scientificName, scientificNameID, kingdom) cleaned_fish
Transform dataset into DwCA v2
Darwin Core Data Package
The mapping is based on schema from https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/
Clean fish data
Clean the fish data so that the fields are ready for use for further manipulation.
Main tables
Agent
There is only Anton here because this version of dataset does not contain squid records (from the catch).
<- tibble(
agent agentID = c("AVdP_collector", "AVdP_specimen_identifier"),
agentType = c("person", "person"),
preferredAgentName = c("Anton Van de Putte", "Anton Van de Putte")
)
agent
Event
How do we determine what eventType
to use? e.g. an Event can be both Survey
and MaterialGathering
.
# Create subevents with net because it is the lowest event with organisms
<- event_google %>%
subevents filter(eventID != "BROKE_WEST") %>% # Exclude the main event
mutate(eventType = "Survey - net type") %>% # Set eventType for subevents
expand_grid(suffix = c("_RMT1", "_RMT8")) %>% # Create subevent suffixes with net size
mutate(
eventID = paste0(eventID, suffix), # Append suffix to eventID
parentEventID = eventID %>% sub("_RMT[18]$", "", .) # Assign correct parent
%>%
) select(-suffix) # Remove the suffix column
# Combine original and subevents
<- bind_rows(event_google, subevents)
event %>% filter(str_starts(eventID, "BROKE_WEST_RMT_001")) %>% select(eventID, parentEventID, eventType) event
Material
Material includes:
- preserved individual fish
- stomach content of individual fish
Can stomach content material have collectedBy
and collectedByID
?
<- cleaned_fish %>%
individual_fish mutate(
materialEntityType = "whole organism",
collectedBy = "Anton Van de Putte",
collectedByID = "https://orcid.org/0000-0003-1336-5554",
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
identificationRemarks = case_when(
str_detect(COMMENTS, "closely resembles") ~ COMMENTS,
str_detect(COMMENTS, "Protomyctophum") ~ COMMENTS,
TRUE ~ ""
),materialEntityRemarks = case_when(
str_detect(COMMENTS, "BROKEN | Check | Microsattelite") ~ COMMENTS,
TRUE ~ ""
)%>%
) select(
materialEntityID, eventID, materialEntityType, preparations, disposition, collectedBy, collectedByID, identifiedBy, identifiedByID, verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank, identificationRemarks, materialEntityRemarks%>% rename(taxonID = scientificNameID) # keep verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank to construct Identification table
)
<- raw_stomach %>%
stomach rename(stomachItem = `Stomach Item`) %>%
mutate(
seqNum = sprintf("%03d", row_number()), # zero-padded sequential number
materialEntityID = str_c(`Fish ID`, "_stomach_", seqNum, "_", stomachItem),
materialEntityType = case_when(
== "St wall" ~ "stomach content - st wall",
stomachItem == "Mucus" ~ "stomach content - mucus",
stomachItem == "Other" ~ "stomach content - other",
stomachItem == "Facet eye" ~ "stomach content - facet eye",
stomachItem == "/" ~ "stomach content - empty",
stomachItem TRUE ~ "stomach content"),
derivedFromMaterialEntityID = `Fish ID`,
derivationType = "stomach content of",
partOfMaterialEntityID = `Fish ID`,
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
verbatimIdentification = case_when(
str_detect(stomachItem, "St wall|Mucus|Other|Facet eye|\\/") ~ "",
TRUE ~ stomachItem
),identificationRemarks = case_when(str_detect(Remarks, "cf.") ~ Remarks, TRUE ~ ""),
materialEntityRemarks = case_when(str_detect(Remarks, "cf.") ~ "", TRUE ~ Remarks),
%>%
) left_join(individual_fish, by = c("derivedFromMaterialEntityID" = "materialEntityID")) %>%
# because left_join renamed the following columns since it is present in both individual_fish and stomach table
mutate(materialEntityType = materialEntityType.x,
verbatimIdentification = verbatimIdentification.x,
materialEntityRemarks = materialEntityRemarks.x,
identificationRemarks = identificationRemarks.x,
identifiedBy = identifiedBy.x,
identifiedByID = identifiedByID.x) %>%
select(materialEntityID, eventID, materialEntityType, identifiedBy, identifiedByID, verbatimIdentification, identificationRemarks, derivedFromMaterialEntityID, derivationType, partOfMaterialEntityID, materialEntityRemarks)
# combine both tables into a Material table
<- bind_rows(individual_fish, stomach)
material
# preview example of a full fish + its stomach content in Material table
%>% filter(str_starts(materialEntityID, "AAV3FF_00025")) material
Identification
Identification terms are mostly part of Material table.
Question: I am expecting OBIS to ask about why taxonID
and not scientificNameID
.
<- "https://archive.org/details/fishesofsouthern00gono | Kellermann, A. (1990): Identification Key and Catalogue of Larval Antarctic Fishes , Berichte zur Polarforschung (Reports on Polar Research), Bremerhaven, Alfred Wegener Institute for Polar and Marine Research, 67 , 136 p. . doi: 10.2312/BzP_0067_1990 | https://archive.org/details/cybium-7-002-003-074"
fish_id_guide
<- material %>%
identification mutate(
identificationID = paste(materialEntityID, "ID", sep = "_"),
identificationBasedOnMaterialEntityID = materialEntityID,
identificationType = "MaterialEntity",
identificationTypeIRI = "http://rs.tdwg.org/dwc/terms/MaterialEntity",
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
identificationReferences = fish_id_guide) %>%
select(identificationID, identificationBasedOnMaterialEntityID, identificationType, identificationTypeIRI, verbatimIdentification, identifiedBy, identifiedByID, identificationReferences, identificationRemarks, taxonID, kingdom, scientificName, taxonRank)
head(identification)
Media
<- media_google %>%
media left_join(material, by = "materialEntityID") %>%
# remove these media records because there is no data associated with these pictures -> we don't know what these are
filter(!is.na(eventID)) %>%
mutate(mediaDescription = str_c("Photo of ", scientificName, " with its label and a grid."))
head(media)
Survey
Survey table is a table that contains current Humboldt Extension terms.
<- c("_RMT1","_RMT8")
suffixes
<- humboldt %>%
survey filter(eventID != "BROKE_WEST") %>% # remove the expedition event, only link humboldt to the lowest level of Survey Events
expand_grid(suffix = suffixes) %>%
mutate(eventID = paste0(eventID, suffix),
# need to cast netSize to string so that it can be joined later (so that it is not a double)
netSize = case_when(suffix == "_RMT1" ~ "1", suffix == "_RMT8" ~ "8", TRUE ~ NA),
surveyID = eventID) %>%
# remove fields that were empty or need to be amended based on new understanding of survey target
select(-`ecoiri:targetTaxonomicScope`, -verbatimSiteDescriptions, -suffix)
head(survey)
Protocol
protocol
Survey Target & Occurrence
After extensive discussions on what target
is, we decided to create survey-target table retrospectively. SurveyTarget needs to be created per net size because it is linked to a Survey (lowest level of Survey is per net size) via SurveyID. SurveyTargetAbundance points to the Survey through the SurveyTarget record.
Challenge: There is only total count for fish from RMT8 and not RMT1. The total count also does not contain any sex
and lifeStage
for all the taxa except Electrona antarctica.
<- function(df){
transform_rmt_count_to_long #' Convert wide count table into long table
#'
#' @param df A tibble of the count table with taxon as columns and trawl event as rows
#' @return A list of 2 tibbles, one is survey_target, the other is survey_target_abundance table
<- df %>%
trawl pivot_longer(cols = 5:ncol(df),
names_to = "verbatimIdentification",
values_to = "organismQuantity",
values_drop_na = TRUE) %>%
mutate(
organismQuantityType = case_when(organismQuantity >= 1 ~ "individuals", TRUE ~ "individual"),
life_stage = case_when(
str_detect(verbatimIdentification, " all$") ~ "",
str_detect(verbatimIdentification, " Larvae$") ~ "Larvae",
str_detect(verbatimIdentification, " Postmeta$") ~ "Postmeta",
TRUE ~ ""
),lifeStage = life_stage, # create extra lifeStage column for occurrence table
# use field name "taxon" for surveyTargetType vocabulary later
taxon = str_remove_all(verbatimIdentification, "\\s*(all|Larvae|Postmeta|sp\\.)$"),
station_number_cleaned = str_remove_all(`Station number`, "\\s*"), # Remove white spaces
surveyID = str_c("BROKE_WEST_RMT_", station_number_cleaned, "_RMT8"),
# need to cast body size to string otherwise double cannot be in the same column with string after pivot_long
# only use size range for RMT8 here because there is no count data for RMT1
`minimum body size` = "0.85",
`maximum body size` = "3"
)
<- wm_records_names(unique(trawl$taxon)) %>%
trawl_taxa # the taxon match returns list of lists, this function creates a table of the following fields out of the lists
map_df(~ select(.x, scientificname, lsid, AphiaID, rank, kingdom)) %>%
rename(taxon = scientificname, taxonID = lsid, taxonRank = rank) %>%
mutate(taxon_rank = taxonRank) # create extra taxonRank column for occurrence table
<- trawl %>%
trawl_count left_join(trawl_taxa, by = "taxon") %>%
mutate(surveyTargetID = case_when(
!= "" ~ str_c(AphiaID, lifeStage, sep = "_"),
lifeStage TRUE ~ as.character(AphiaID)))
<- trawl_count %>%
trawl_long pivot_longer(
cols = c("taxon", "life_stage", "taxon_rank", "minimum body size", "maximum body size"),
names_to = "surveyTargetType",
values_to = "surveyTargetValue",
values_drop_na = TRUE
%>%
) mutate(
surveyTargetUnit = case_when(
== "minimum body size" ~ "mm",
surveyTargetType == "maximum body size" ~ "m",
surveyTargetType TRUE ~ NA
),surveyTargetUnitIRI = case_when(
== "mm" ~ "http://vocab.nerc.ac.uk/collection/P06/current/UXMM/",
surveyTargetUnit == "m" ~ "http://vocab.nerc.ac.uk/collection/P06/current/ULAA/",
surveyTargetUnit TRUE ~ NA
),surveyTargetValueIRI = case_when(
== "taxon" ~ taxonID,
surveyTargetType == "Genus" ~ "http://rs.tdwg.org/dwc/terms/genus",
surveyTargetValue == "Species" ~ "http://rs.tdwg.org/dwc/terms/specificEpithet",
surveyTargetValue == "Family" ~ "http://rs.tdwg.org/dwc/terms/family",
surveyTargetValue TRUE ~ NA
),surveyTargetType = case_when(
== "life_stage" ~ "life stage",
surveyTargetType == "taxon_rank" ~ "taxon rank",
surveyTargetType TRUE ~ surveyTargetType
),includeOrExclude = "include",
isSurveyTargetFullyReported = "true"
%>% filter(!(surveyTargetType == "life stage" & surveyTargetValue == "")) # remove empty lifeStage row
)
<- trawl_long %>%
survey_target select(surveyTargetID, surveyID, surveyTargetType, surveyTargetValue, surveyTargetValueIRI, surveyTargetUnit, surveyTargetUnitIRI)
<- trawl_long %>%
occurrence filter(surveyTargetType == "taxon" & organismQuantity != 0) %>%
rename(
scientificName = surveyTargetValue,
eventID = surveyID) %>%
mutate(
occurrenceStatus = "present",
recordedBy = "Anton Van de Putte",
recordedByID = "https://orcid.org/0000-0003-1336-5554",
identifiedBy = "Anton Van de Putte",
identifiedByID = "https://orcid.org/0000-0003-1336-5554",
)return(list(survey_target = survey_target, occurrence = occurrence))
}
<- transform_rmt_count_to_long(raw_count_routine)
routine_trawl <- transform_rmt_count_to_long(raw_count_target)
target_trawl
<- rbind(routine_trawl$survey_target, target_trawl$survey_target)
survey_target <- rbind(routine_trawl$occurrence, target_trawl$occurrence) %>%
occurrence mutate(occurrenceID = str_c(eventID, surveyTargetID, sep = "_")) %>%
select(occurrenceID, eventID, recordedBy, recordedByID, identifiedBy, identifiedByID, verbatimIdentification, scientificName, taxonID, taxonRank, kingdom,lifeStage, organismQuantityType, organismQuantity)
# preview example tables
head(survey_target)
head(occurrence)
Common tables
Agent role
Agent Agent role
<- tibble(
agent_agent_role agentID = "AVdP_collector",
agentRole = "collector",
) agent_agent_role
Event Agent role
<- event %>%
event_agent_role select(eventID) %>% filter(eventID != "BROKE_WEST") %>%
mutate(agentID = "AVdP_collector", agentRole = "collector")
%>% filter(eventID == "BROKE_WEST_RMT_001") event_agent_role
Identification Agent role
<- identification %>%
identification_agent_role select(identificationID) %>%
mutate(agentID = "AVdP_specimen_identifier", agentRole = "specimen identifier")
head(identification_agent_role)
Survey Agent role
<- survey %>%
survey_agent_role select(surveyID) %>%
mutate(agentID = "AVdP_collector", agentRole = "collector")
head(survey_agent_role)
Assertion
Event Assertion
Placing environmental measurements under Event Assertion instead of Survey Assertion because it is for both RMT1 and RMT8. Putting this at the level of station Event and not net Event.
Expedition
|__ Sampling station
|__ RMT1
|__ RMT8
<- emof %>%
assertion rename(assertionID = measurementID, assertionType = measurementType, assertionTypeIRI = measurementTypeID, assertionValueIRI = measurementValueID, assertionUnit = measurementUnit, assertionUnitIRI = measurementUnitID) %>%
mutate(assertionValue = if_else(grepl("[a-zA-Z]", measurementValue), measurementValue, NA),
assertionValueNumeric = if_else(!grepl("[a-zA-Z]", measurementValue), as.numeric(measurementValue), NA))
<- assertion %>%
event_assertion filter(is.na(occurrenceID)) %>%
mutate(
assertionTypeVocabulary = "http://vocab.nerc.ac.uk/collection/P01/current/",
assertionProtocol = case_when(
%in% c("Average temperature", "Maximum Pressure", "Average Salinity", "Average Conductivity", "Average fluoresence") ~ "CTD",
assertionType %in% c("Solar Azimuth", "Solar elevation", "daylength") ~ "solar azimuth, elevation and day length",
assertionType TRUE ~ ""),
assertionProtocolID = case_when(
== "CTD" ~ "ctd",
assertionProtocol == "solar azimuth, elevation and day length" ~ "light_conditions",
assertionProtocol TRUE ~ ""),
assertionUnitVocabulary = "http://vocab.nerc.ac.uk/collection/P06/current/"
%>%
) select(assertionID, eventID, assertionType, assertionTypeIRI, assertionTypeVocabulary, assertionValue,
assertionValueNumeric, assertionValueIRI, assertionUnit, assertionUnitIRI, assertionUnitVocabulary, assertionProtocol, assertionProtocolID)%>% filter(assertionID == "BROKE_WEST_RMT_028_temp") event_assertion
Material Assertion
<- assertion %>%
material_assertion filter(!is.na(occurrenceID)) %>%
rename(materialEntityID = occurrenceID) %>%
select(-"assertionValueIRI", -"eventID")
%>% filter(materialEntityID == "AAV3FF_00001") material_assertion
Media
Material Media
<- media %>%
material_media select(mediaID, materialEntityID) %>%
mutate(mediaSubjectCategory = "specimen photo")
head(material_media)
Protocol
Survey Protocol
As Event is also a Survey, I use Survey Protocol table instead of Event Protocol table.
<- survey %>%
survey_protocol select(surveyID, samplingProtocol) %>%
rename(protocolID = samplingProtocol) %>%
filter(!is.na(protocolID))
%>% filter(surveyID == "BROKE_WEST_RMT_001_RMT8") survey_protocol
Identifier
Agent Identifier
<- tibble(
agent_identifier identifier = c("https://orcid.org/0000-0003-1336-5554", "https://orcid.org/0000-0003-1336-5554"),
agentID = c("AVdP_collector", "AVdP_specimen_identifier"),
identifierType = c("ORCID", "ORCID"),
identifierLanguage = c("eng", "eng")
) agent_identifier