Transform dataset into DwCA v2

Published

February 21, 2025

Darwin Core Data Package

The mapping is based on schema from https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/

Clean fish data

Clean the fish data so that the fields are ready for use for further manipulation.

library(worrms)

cleaned_fish <- raw_fish_data %>%
  # exclude those without sample number
  filter(`Sample number` != "/") %>%
  rename(
    family = Family,
    preparations = presrvation,
    disposition = Location,
    lifeStage = C1,
    materialEntityID = `Sample number`,
    netSize = `Net Size`
  ) %>%
  mutate(
    station_number_cleaned = str_remove_all(`Station number`, "\\s*"), # Remove white spaces
    eventID = str_c("BROKE_WEST_RMT_", station_number_cleaned, "_RMT", `netSize`), 
    verbatimIdentification = case_when(
      Species == "Unidentified" ~ "", 
      Species == "/" ~ "",
      TRUE ~ Species),
    scientificName = case_when(
      grepl(" sp\\.$", verbatimIdentification) ~ sub(" sp\\.$", "", verbatimIdentification),
      # correct typo of scientificName based on WoRMS
      verbatimIdentification == "Protomytophum bollini" ~ "Protomyctophum bolini",
      verbatimIdentification == "Artedidraco scottsbergi" ~ "Artedidraco skottsbergi",
      verbatimIdentification == "Gymnoscophelus braueri" ~ "Gymnoscopelus braueri",
      # Unidentified == Biota because lifeStage = larvae for these records. Can we be sure that these are fish?
      Species == "Unidentified" ~ "Biota", 
      Species == "/" ~ "",
      TRUE ~ verbatimIdentification
    )
    # no more identificationQualifier, use verbatimIdentification :)
  )
  
# match unique scientificName to WoRMS to obtain LSID of AphiaID
matched_taxa <- wm_records_names(unique(cleaned_fish$scientificName)) %>%
  # the taxon match returns list of lists, this function creates a table of the following fields out of the lists
  # keep aphiaID for surveyTargetID
  map_df(~ select(.x, scientificname, lsid, rank, kingdom, AphiaID)) %>%
  rename(scientificName = scientificname, scientificNameID = lsid, taxonRank = rank)

# join table to get scientificNameID and taxonRank
cleaned_fish <- cleaned_fish %>% 
  left_join(matched_taxa, by = "scientificName")

# preview distinct verbatimIdentification and matched scientificName and scientificNameID from WoRMS
cleaned_fish %>% distinct(verbatimIdentification, scientificName, scientificNameID, kingdom)

Main tables

Agent

There is only Anton here because this version of dataset does not contain squid records (from the catch).

agent <- tibble(
  agentID = c("AVdP_collector", "AVdP_specimen_identifier"),
  agentType = c("person", "person"),
  preferredAgentName = c("Anton Van de Putte", "Anton Van de Putte")
)

agent

Event

How do we determine what eventType to use? e.g. an Event can be both Survey and MaterialGathering.

# Create subevents with net because it is the lowest event with organisms
subevents <- event_google %>%
  filter(eventID != "BROKE_WEST") %>% # Exclude the main event
  mutate(eventType = "Survey - net type") %>% # Set eventType for subevents
  expand_grid(suffix = c("_RMT1", "_RMT8")) %>% # Create subevent suffixes with net size
  mutate(
    eventID = paste0(eventID, suffix), # Append suffix to eventID
    parentEventID = eventID %>% sub("_RMT[18]$", "", .) # Assign correct parent
  ) %>%
  select(-suffix) # Remove the suffix column

# Combine original and subevents
event <- bind_rows(event_google, subevents)
event %>% filter(str_starts(eventID, "BROKE_WEST_RMT_001")) %>% select(eventID, parentEventID, eventType)

Material

Material includes:

preserved individual fish
stomach content of individual fish

Can stomach content material have collectedBy and collectedByID?

individual_fish <- cleaned_fish %>% 
  mutate(
    materialEntityType = "whole organism",  
    collectedBy = "Anton Van de Putte",
    collectedByID = "https://orcid.org/0000-0003-1336-5554",
    identifiedBy = "Anton Van de Putte",
    identifiedByID = "https://orcid.org/0000-0003-1336-5554",
    identificationRemarks = case_when(
      str_detect(COMMENTS, "closely resembles") ~ COMMENTS,
      str_detect(COMMENTS, "Protomyctophum") ~ COMMENTS,
      TRUE ~ ""
    ),
    materialEntityRemarks = case_when(
      str_detect(COMMENTS, "BROKEN | Check | Microsattelite") ~ COMMENTS,
      TRUE ~ ""
    )
    ) %>% 
  select(
    materialEntityID, eventID, materialEntityType, preparations, disposition, collectedBy, collectedByID, identifiedBy, identifiedByID, verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank, identificationRemarks, materialEntityRemarks
  ) %>% rename(taxonID = scientificNameID) # keep verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank to construct Identification table

stomach <- raw_stomach %>%
  rename(stomachItem = `Stomach Item`) %>%
  mutate(
    seqNum = sprintf("%03d", row_number()), # zero-padded sequential number
    materialEntityID = str_c(`Fish ID`, "_stomach_", seqNum, "_", stomachItem),
    materialEntityType = case_when(
      stomachItem == "St wall" ~ "stomach content - st wall",
      stomachItem == "Mucus" ~ "stomach content - mucus",
      stomachItem == "Other" ~ "stomach content - other",
      stomachItem == "Facet eye" ~ "stomach content - facet eye",
      stomachItem == "/" ~ "stomach content - empty",
      TRUE ~ "stomach content"),
    derivedFromMaterialEntityID = `Fish ID`,
    derivationType = "stomach content of",
    partOfMaterialEntityID = `Fish ID`,
    identifiedBy = "Anton Van de Putte",
    identifiedByID = "https://orcid.org/0000-0003-1336-5554",
    verbatimIdentification = case_when(
      str_detect(stomachItem, "St wall|Mucus|Other|Facet eye|\\/") ~  "",
      TRUE ~ stomachItem
    ),
    identificationRemarks = case_when(str_detect(Remarks, "cf.") ~ Remarks, TRUE ~ ""),
    materialEntityRemarks = case_when(str_detect(Remarks, "cf.") ~ "", TRUE ~ Remarks),
  ) %>% 
  left_join(individual_fish, by = c("derivedFromMaterialEntityID" = "materialEntityID")) %>%
  # because left_join renamed the following columns since it is present in both individual_fish and stomach table
  mutate(materialEntityType = materialEntityType.x,
         verbatimIdentification = verbatimIdentification.x,
         materialEntityRemarks = materialEntityRemarks.x,
         identificationRemarks = identificationRemarks.x,
         identifiedBy = identifiedBy.x,
         identifiedByID = identifiedByID.x) %>% 
  select(materialEntityID, eventID, materialEntityType, identifiedBy, identifiedByID, verbatimIdentification, identificationRemarks, derivedFromMaterialEntityID, derivationType, partOfMaterialEntityID, materialEntityRemarks)

# combine both tables into a Material table
material <- bind_rows(individual_fish, stomach) 

# preview example of a full fish + its stomach content in Material table
material %>% filter(str_starts(materialEntityID, "AAV3FF_00025"))

Identification

Identification terms are mostly part of Material table.

Question: I am expecting OBIS to ask about why taxonID and not scientificNameID.

fish_id_guide <- "https://archive.org/details/fishesofsouthern00gono | Kellermann, A. (1990): Identification Key and Catalogue of Larval Antarctic Fishes , Berichte zur Polarforschung (Reports on Polar Research), Bremerhaven, Alfred Wegener Institute for Polar and Marine Research, 67 , 136 p. . doi: 10.2312/BzP_0067_1990 | https://archive.org/details/cybium-7-002-003-074"

identification <- material %>%
  mutate(
    identificationID = paste(materialEntityID, "ID", sep = "_"),
    identificationBasedOnMaterialEntityID = materialEntityID,
    identificationType = "MaterialEntity",
    identificationTypeIRI = "http://rs.tdwg.org/dwc/terms/MaterialEntity",
    identifiedBy = "Anton Van de Putte",
    identifiedByID = "https://orcid.org/0000-0003-1336-5554",
    identificationReferences = fish_id_guide) %>%
  select(identificationID, identificationBasedOnMaterialEntityID, identificationType, identificationTypeIRI, verbatimIdentification, identifiedBy, identifiedByID, identificationReferences, identificationRemarks, taxonID, kingdom, scientificName, taxonRank)
head(identification)

Media

media <- media_google %>% 
  left_join(material, by = "materialEntityID") %>% 
  # remove these media records because there is no data associated with these pictures -> we don't know what these are
  filter(!is.na(eventID)) %>%
  mutate(mediaDescription = str_c("Photo of ", scientificName, " with its label and a grid.")) 
head(media)

Survey

Survey table is a table that contains current Humboldt Extension terms.

suffixes <- c("_RMT1","_RMT8")

survey <- humboldt %>%
  filter(eventID != "BROKE_WEST") %>% # remove the expedition event, only link humboldt to the lowest level of Survey Events
  expand_grid(suffix = suffixes) %>%
  mutate(eventID = paste0(eventID, suffix),
         # need to cast netSize to string so that it can be joined later (so that it is not a double)
         netSize = case_when(suffix == "_RMT1" ~ "1", suffix == "_RMT8" ~ "8", TRUE ~ NA),
         surveyID = eventID) %>%
  # remove fields that were empty or need to be amended based on new understanding of survey target
  select(-`ecoiri:targetTaxonomicScope`, -verbatimSiteDescriptions, -suffix)
head(survey)

Protocol

protocol

Survey Target & Occurrence

After extensive discussions on what target is, we decided to create survey-target table retrospectively. SurveyTarget needs to be created per net size because it is linked to a Survey (lowest level of Survey is per net size) via SurveyID. SurveyTargetAbundance points to the Survey through the SurveyTarget record.

Challenge: There is only total count for fish from RMT8 and not RMT1. The total count also does not contain any sex and lifeStage for all the taxa except Electrona antarctica.

transform_rmt_count_to_long <- function(df){
  #' Convert wide count table into long table
  #' 
  #' @param df A tibble of the count table with taxon as columns and trawl event as rows
  #' @return A list of 2 tibbles, one is survey_target, the other is survey_target_abundance table
  trawl <- df %>%
  pivot_longer(cols = 5:ncol(df),
               names_to = "verbatimIdentification",
               values_to = "organismQuantity",
               values_drop_na = TRUE) %>%
  mutate(
    organismQuantityType = case_when(organismQuantity >= 1 ~ "individuals", TRUE ~ "individual"),
    life_stage = case_when(
      str_detect(verbatimIdentification, " all$") ~ "",
      str_detect(verbatimIdentification, " Larvae$") ~ "Larvae",
      str_detect(verbatimIdentification, " Postmeta$") ~ "Postmeta",
      TRUE ~ ""
    ),
    lifeStage = life_stage, # create extra lifeStage column for occurrence table
    # use field name "taxon" for surveyTargetType vocabulary later
    taxon = str_remove_all(verbatimIdentification, "\\s*(all|Larvae|Postmeta|sp\\.)$"),
    station_number_cleaned = str_remove_all(`Station number`, "\\s*"), # Remove white spaces
    surveyID = str_c("BROKE_WEST_RMT_", station_number_cleaned, "_RMT8"),
    # need to cast body size to string otherwise double cannot be in the same column with string after pivot_long
    # only use size range for RMT8 here because there is no count data for RMT1 
    `minimum body size` = "0.85",
    `maximum body size` = "3"
  )

trawl_taxa <- wm_records_names(unique(trawl$taxon)) %>%
  # the taxon match returns list of lists, this function creates a table of the following fields out of the lists
  map_df(~ select(.x, scientificname, lsid, AphiaID, rank, kingdom)) %>%
  rename(taxon = scientificname, taxonID = lsid, taxonRank = rank) %>%
  mutate(taxon_rank = taxonRank) # create extra taxonRank column for occurrence table

trawl_count <- trawl %>% 
  left_join(trawl_taxa, by = "taxon") %>%
  mutate(surveyTargetID = case_when(
    lifeStage != "" ~ str_c(AphiaID, lifeStage, sep = "_"),
    TRUE ~ as.character(AphiaID)))

trawl_long <- trawl_count %>%
  pivot_longer(
    cols = c("taxon", "life_stage", "taxon_rank", "minimum body size", "maximum body size"),
    names_to = "surveyTargetType",
    values_to = "surveyTargetValue",
    values_drop_na = TRUE
  ) %>%
  mutate(
    surveyTargetUnit = case_when(
      surveyTargetType == "minimum body size" ~ "mm",
      surveyTargetType == "maximum body size" ~ "m",
      TRUE ~ NA
    ),
    surveyTargetUnitIRI = case_when(
      surveyTargetUnit == "mm" ~ "http://vocab.nerc.ac.uk/collection/P06/current/UXMM/",
      surveyTargetUnit == "m" ~ "http://vocab.nerc.ac.uk/collection/P06/current/ULAA/",
      TRUE ~ NA
    ),
    surveyTargetValueIRI = case_when(
      surveyTargetType == "taxon" ~ taxonID,
      surveyTargetValue == "Genus" ~ "http://rs.tdwg.org/dwc/terms/genus",
      surveyTargetValue == "Species" ~ "http://rs.tdwg.org/dwc/terms/specificEpithet",
      surveyTargetValue == "Family" ~ "http://rs.tdwg.org/dwc/terms/family",
      TRUE ~ NA
    ),
    surveyTargetType = case_when(
      surveyTargetType == "life_stage" ~ "life stage",
      surveyTargetType == "taxon_rank" ~ "taxon rank",
      TRUE ~ surveyTargetType
    ),
    includeOrExclude = "include",
    isSurveyTargetFullyReported = "true"
  ) %>% filter(!(surveyTargetType == "life stage" & surveyTargetValue == "")) # remove empty lifeStage row

survey_target <- trawl_long %>%
  select(surveyTargetID, surveyID, surveyTargetType, surveyTargetValue, surveyTargetValueIRI, surveyTargetUnit, surveyTargetUnitIRI)

occurrence <- trawl_long %>%
  filter(surveyTargetType == "taxon" & organismQuantity != 0) %>%
  rename(
    scientificName = surveyTargetValue, 
    eventID = surveyID) %>%
  mutate(
    occurrenceStatus = "present",
    recordedBy = "Anton Van de Putte",
    recordedByID = "https://orcid.org/0000-0003-1336-5554",
    identifiedBy = "Anton Van de Putte",
    identifiedByID = "https://orcid.org/0000-0003-1336-5554",
  )
return(list(survey_target = survey_target, occurrence = occurrence))
}

routine_trawl <- transform_rmt_count_to_long(raw_count_routine)
target_trawl <- transform_rmt_count_to_long(raw_count_target)

survey_target <- rbind(routine_trawl$survey_target, target_trawl$survey_target)
occurrence <- rbind(routine_trawl$occurrence, target_trawl$occurrence) %>%
  mutate(occurrenceID = str_c(eventID, surveyTargetID, sep = "_")) %>%
  select(occurrenceID, eventID, recordedBy, recordedByID, identifiedBy, identifiedByID, verbatimIdentification, scientificName, taxonID, taxonRank, kingdom,lifeStage, organismQuantityType, organismQuantity)

# preview example tables
head(survey_target)

head(occurrence)

Common tables

Agent role

Agent Agent role

agent_agent_role <- tibble(
  agentID = "AVdP_collector",
  agentRole = "collector",
)
agent_agent_role

Event Agent role

event_agent_role <- event %>% 
  select(eventID) %>% filter(eventID != "BROKE_WEST") %>%
  mutate(agentID = "AVdP_collector", agentRole = "collector")
event_agent_role %>% filter(eventID == "BROKE_WEST_RMT_001")

Identification Agent role

identification_agent_role <- identification %>% 
  select(identificationID) %>%
  mutate(agentID = "AVdP_specimen_identifier", agentRole = "specimen identifier")
head(identification_agent_role)

Survey Agent role

survey_agent_role <- survey %>% 
  select(surveyID) %>%
  mutate(agentID = "AVdP_collector", agentRole = "collector")
head(survey_agent_role)

Assertion

Event Assertion

Placing environmental measurements under Event Assertion instead of Survey Assertion because it is for both RMT1 and RMT8. Putting this at the level of station Event and not net Event.

Expedition
|__ Sampling station
    |__ RMT1
    |__ RMT8

assertion <- emof %>% 
  rename(assertionID = measurementID, assertionType = measurementType, assertionTypeIRI = measurementTypeID, assertionValueIRI = measurementValueID, assertionUnit = measurementUnit, assertionUnitIRI = measurementUnitID) %>%
  mutate(assertionValue = if_else(grepl("[a-zA-Z]", measurementValue), measurementValue, NA),
    assertionValueNumeric = if_else(!grepl("[a-zA-Z]", measurementValue), as.numeric(measurementValue), NA))

event_assertion <- assertion %>% 
  filter(is.na(occurrenceID)) %>%
  mutate(
    assertionTypeVocabulary = "http://vocab.nerc.ac.uk/collection/P01/current/",
    assertionProtocol = case_when(
      assertionType %in% c("Average temperature", "Maximum Pressure", "Average Salinity", "Average Conductivity", "Average fluoresence") ~ "CTD",
      assertionType %in% c("Solar Azimuth", "Solar elevation", "daylength") ~ "solar azimuth, elevation and day length",
      TRUE ~ ""),
    assertionProtocolID = case_when(
      assertionProtocol == "CTD" ~ "ctd",
      assertionProtocol == "solar azimuth, elevation and day length" ~ "light_conditions",
      TRUE ~ ""),
    assertionUnitVocabulary = "http://vocab.nerc.ac.uk/collection/P06/current/" 
    ) %>%
  select(assertionID, eventID, assertionType, assertionTypeIRI, assertionTypeVocabulary, assertionValue,
         assertionValueNumeric, assertionValueIRI, assertionUnit, assertionUnitIRI, assertionUnitVocabulary, assertionProtocol, assertionProtocolID)
event_assertion %>% filter(assertionID == "BROKE_WEST_RMT_028_temp")

Material Assertion

material_assertion <- assertion %>% 
  filter(!is.na(occurrenceID)) %>%
  rename(materialEntityID = occurrenceID) %>%
  select(-"assertionValueIRI", -"eventID")
material_assertion %>% filter(materialEntityID == "AAV3FF_00001")

Media

Material Media

material_media <- media %>%
  select(mediaID, materialEntityID) %>%
  mutate(mediaSubjectCategory = "specimen photo")
head(material_media)

Protocol

Survey Protocol

As Event is also a Survey, I use Survey Protocol table instead of Event Protocol table.

survey_protocol <- survey %>% 
  select(surveyID, samplingProtocol) %>% 
  rename(protocolID = samplingProtocol) %>%
  filter(!is.na(protocolID))
survey_protocol %>% filter(surveyID == "BROKE_WEST_RMT_001_RMT8")

Identifier

Agent Identifier

agent_identifier <- tibble(
  identifier = c("https://orcid.org/0000-0003-1336-5554", "https://orcid.org/0000-0003-1336-5554"),
  agentID = c("AVdP_collector", "AVdP_specimen_identifier"),
  identifierType = c("ORCID", "ORCID"),
  identifierLanguage = c("eng", "eng")
)
agent_identifier