Transform dataset into DwC-DP

Published

April 24, 2025

Darwin Core Data Package

The mapping is based on schema from https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/

Clean fish data

Clean the fish data so that the fields are ready for use for further manipulation.

library(worrms)

cleaned_fish <- raw_fish_data %>%
  # exclude those without sample number
  filter(`Sample number` != "/") %>%
  rename(
    family = Family,
    preparations = presrvation,
    lifeStage = C1,
    materialEntityID = `Sample number`,
    netSize = `Net Size`
  ) %>%
  mutate(
    station_number_cleaned = str_remove_all(`Station number`, "\\s*"), # Remove white spaces
    eventID = str_c("BROKE_WEST_RMT_", station_number_cleaned, "_RMT", `netSize`), 
    disposition = case_when(
      Location == "Anton" ~ "pending accession into Institue of Natural Sciences' collection",
      # not sure where the samples of Evgeny and Megan go
      TRUE ~ ""),
    verbatimIdentification = case_when(
      Species == "Unidentified" ~ "", 
      Species == "/" ~ "",
      TRUE ~ Species),
    scientificName = case_when(
      grepl(" sp\\.$", verbatimIdentification) ~ sub(" sp\\.$", "", verbatimIdentification),
      # correct typo of scientificName based on WoRMS
      verbatimIdentification == "Protomytophum bollini" ~ "Protomyctophum bolini",
      verbatimIdentification == "Artedidraco scottsbergi" ~ "Artedidraco skottsbergi",
      verbatimIdentification == "Gymnoscophelus braueri" ~ "Gymnoscopelus braueri",
      # Unidentified == Biota because lifeStage = larvae for these records. Can we be sure that these are fish?
      Species == "Unidentified" ~ "Biota", 
      Species == "/" ~ "",
      TRUE ~ verbatimIdentification
    )
    # no more identificationQualifier, use verbatimIdentification :)
  )
  
# match unique scientificName to WoRMS to obtain LSID of AphiaID
matched_taxa <- wm_records_names(unique(cleaned_fish$scientificName)) %>%
  # the taxon match returns list of lists, this function creates a table of the following fields out of the lists
  # keep aphiaID for surveyTargetID
  map_df(~ select(.x, scientificname, lsid, rank, kingdom, AphiaID)) %>%
  rename(scientificName = scientificname, scientificNameID = lsid, taxonRank = rank)

# join table to get scientificNameID and taxonRank
cleaned_fish <- cleaned_fish %>% 
  left_join(matched_taxa, by = "scientificName")

# preview distinct verbatimIdentification and matched scientificName and scientificNameID from WoRMS
cleaned_fish %>% distinct(verbatimIdentification, scientificName, scientificNameID, kingdom)

Main tables

Agent

There is only Anton here because this version of dataset does not contain squid records (from the catch).

agent <- tibble(
  agentID = "https://orcid.org/0000-0003-1336-5554",
  agentType = "person",
  preferredAgentName = "Anton Van de Putte"
) %>% filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/agent.json")

Event

How do we determine what eventType to use? e.g. an Event can be both Survey and MaterialGathering.

# Create subevents with net because it is the lowest event with organisms
subevents <- event_google %>%
  filter(eventID != "BROKE_WEST") %>% # Exclude the main event
  mutate(eventType = "Survey - net type") %>% # Set eventType for subevents
  expand_grid(suffix = c("_RMT1", "_RMT8")) %>% # Create subevent suffixes with net size
  mutate(
    eventID = paste0(eventID, suffix), # Append suffix to eventID
    parentEventID = eventID %>% sub("_RMT[18]$", "", .) # Assign correct parent
  ) %>%
  select(-suffix) # Remove the suffix column
  
# Combine original and subevents
event <- bind_rows(event_google, subevents) %>%
  mutate(
    sampleSizeValue = case_when(
      str_detect(eventID, "_RMT8") ~ sampleSizeValue,
      TRUE ~ NA
      ),
    sampleSizeUnit = case_when(
    str_detect(eventID, "_RMT8") ~ sampleSizeUnit,
    TRUE ~ NA)
  ) %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/event.json")
  
event %>% filter(str_starts(eventID, "BROKE_WEST_RMT_001")) %>% select(eventID, parentEventID, eventType)

Material

Material includes:

preserved individual fish
stomach content of individual fish

individual_fish <- cleaned_fish %>% 
  mutate(
    materialEntityType = "whole organism",  
    collectedBy = "Anton Van de Putte",
    collectedByID = "https://orcid.org/0000-0003-1336-5554",
    identifiedBy = "Anton Van de Putte",
    identifiedByID = "https://orcid.org/0000-0003-1336-5554",
    identificationRemarks = case_when(
      str_detect(COMMENTS, "closely resembles") ~ COMMENTS,
      str_detect(COMMENTS, "Protomyctophum") ~ COMMENTS,
      TRUE ~ ""
    ),
    materialEntityRemarks = case_when(
      str_detect(COMMENTS, "BROKEN | Check | Microsattelite") ~ COMMENTS,
      TRUE ~ ""
    )
    ) %>% 
  select(
    materialEntityID, eventID, materialEntityType, preparations, disposition, collectedBy, collectedByID, identifiedBy, identifiedByID, verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank, identificationRemarks, materialEntityRemarks
  ) %>% rename(taxonID = scientificNameID) # keep verbatimIdentification, scientificName, scientificNameID, kingdom, taxonRank to construct Identification table

stomach <- raw_stomach %>%
  rename(stomachItem = `Stomach Item`) %>%
  mutate(
    seqNum = sprintf("%03d", row_number()), # zero-padded sequential number
    materialEntityID = str_c(`Fish ID`, "_stomach_", seqNum, "_", stomachItem),
    materialEntityType = case_when(
      stomachItem == "St wall" ~ "stomach content - st wall",
      stomachItem == "Mucus" ~ "stomach content - mucus",
      stomachItem == "Other" ~ "stomach content - other",
      stomachItem == "Facet eye" ~ "stomach content - facet eye",
      stomachItem == "/" ~ "stomach content - empty",
      TRUE ~ "stomach content"),
    derivedFromMaterialEntityID = `Fish ID`,
    derivationType = "stomach content of",
    partOfMaterialEntityID = `Fish ID`,
    identifiedBy = "Anton Van de Putte",
    identifiedByID = "https://orcid.org/0000-0003-1336-5554",
    verbatimIdentification = case_when(
      str_detect(stomachItem, "St wall|Mucus|Other|Facet eye|\\/") ~  "",
      TRUE ~ stomachItem
    ),
    identificationRemarks = case_when(str_detect(Remarks, "cf.") ~ Remarks, TRUE ~ ""),
    materialEntityRemarks = case_when(str_detect(Remarks, "cf.") ~ "", TRUE ~ Remarks),
  ) %>% 
  left_join(individual_fish, by = c("derivedFromMaterialEntityID" = "materialEntityID")) %>%
  # because left_join renamed the following columns since it is present in both individual_fish and stomach table
  rename(materialEntityType = materialEntityType.x,
         verbatimIdentification = verbatimIdentification.x,
         materialEntityRemarks = materialEntityRemarks.x,
         identificationRemarks = identificationRemarks.x,
         identifiedBy = identifiedBy.x,
         identifiedByID = identifiedByID.x) 
# %>% 
#   select(materialEntityID, eventID, materialEntityType, identifiedBy, identifiedByID, verbatimIdentification, identificationRemarks, derivedFromMaterialEntityID, derivationType, partOfMaterialEntityID, materialEntityRemarks)

# combine both tables into a Material table
material_join_identification <- bind_rows(individual_fish, stomach) 

# preview example of a full fish + its stomach content in Material table
material <- material_join_identification %>%
  mutate(materialCategory = "preserved") %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material.json")
material %>% filter(str_starts(materialEntityID, "AAV3FF_00337"))

Identification

Identification terms are mostly part of Material table.

Question: I am expecting OBIS to ask about why taxonID and not scientificNameID.

fish_id_guide <- "https://archive.org/details/fishesofsouthern00gono | Kellermann, A. (1990): Identification Key and Catalogue of Larval Antarctic Fishes , Berichte zur Polarforschung (Reports on Polar Research), Bremerhaven, Alfred Wegener Institute for Polar and Marine Research, 67 , 136 p. . doi: 10.2312/BzP_0067_1990 | https://archive.org/details/cybium-7-002-003-074"

identification <- material_join_identification %>%
  mutate(
    identificationID = paste(materialEntityID, "ID", sep = "_"),
    basedOnMaterialEntityID = materialEntityID,
    identificationType = "MaterialEntity",
    identifiedBy = "Anton Van de Putte",
    identifiedByID = "https://orcid.org/0000-0003-1336-5554",
    identificationReferences = fish_id_guide) %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/identification.json")
head(identification)

Media

media_material <- media_google %>% 
  # also needed to construct Material Media table later
  left_join(material, by = "materialEntityID") %>% 
  # remove these media records because there is no data associated with these pictures -> we don't know what these are
  filter(!is.na(eventID)) %>%
  mutate(mediaDescription = str_c("Photo of ", scientificName, " with its label and a grid.")) 

media <- filter_dataframe_by_json(media_material, "https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/media.json")
head(media)

Survey

Survey table is a table that contains current Humboldt Extension terms.

suffixes <- c("_RMT1","_RMT8")

survey_w_protocol <- humboldt %>%
  filter(eventID != "BROKE_WEST") %>% # remove the expedition event, only link humboldt to the lowest level of Survey Events
  expand_grid(suffix = suffixes) %>%
  mutate(eventID = paste0(eventID, suffix),
         # need to cast netSize to string so that it can be joined later (so that it is not a double)
         netSize = case_when(suffix == "_RMT1" ~ "1", suffix == "_RMT8" ~ "8", TRUE ~ NA),
         surveyID = eventID,
         isLeastSpecificTargetCategoryQuantityInclusive = "true") 

survey <- survey_w_protocol %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/survey.json")
head(survey)

Protocol

protocol <- filter_dataframe_by_json(protocol, "https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/protocol.json")
protocol

Survey Target & Occurrence

After extensive discussions on what target is, we decided to create survey-target table retrospectively. SurveyTarget needs to be created per net size because it is linked to a Survey (lowest level of Survey is per net size) via SurveyID. SurveyTargetAbundance points to the Survey through the SurveyTarget record.

Challenge: There is only total count for fish from RMT8 and not RMT1. The total count also does not contain any sex and lifeStage for all the taxa except Electrona antarctica.

routine_trawl <- transform_rmt_count_to_long(raw_count_routine)
target_trawl <- transform_rmt_count_to_long(raw_count_target)

survey_target <- rbind(routine_trawl$survey_target, target_trawl$survey_target) %>%
  mutate(includeOrExclude = "include", isSurveyTargetFullyReported = "true") %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/survey-target.json")
occurrence <- rbind(routine_trawl$occurrence, target_trawl$occurrence) %>%
  mutate(occurrenceID = str_c(eventID, surveyTargetID, sep = "_"),
         occurrenceStatus = "detected",
         organismQuantityType = case_when(organismQuantity <= 1 ~ "individual",
                                          organismQuantity > 1 ~ "individuals")) %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/occurrence.json")

# preview example tables
head(survey_target)

head(occurrence)

Common tables

Assertion

Event Assertion

Placing environmental measurements under Event Assertion instead of Survey Assertion because it is for both RMT1 and RMT8. Putting this at the level of station Event and not net Event.

Expedition
|__ Sampling station
    |__ RMT1
    |__ RMT8

assertion <- emof %>% 
  rename(assertionID = measurementID, assertionType = measurementType, assertionTypeIRI = measurementTypeID, assertionValueIRI = measurementValueID, assertionUnit = measurementUnit, assertionUnitIRI = measurementUnitID) %>%
  mutate(assertionValue = if_else(grepl("[a-zA-Z]", measurementValue), measurementValue, NA),
    assertionValueNumeric = if_else(!grepl("[a-zA-Z]", measurementValue), as.numeric(measurementValue), NA))

event_assertion <- assertion %>% 
  filter(is.na(occurrenceID)) %>%
  mutate(
    eventID = case_when(
      # because sampled volume is only available for RMT8
      assertionType == "Sampled volume" ~ str_c(eventID, "_RMT8"),
      TRUE ~ eventID
    ),
    assertionID = case_when(
      assertionType == "Sampled volume" ~ str_c(eventID, "_RMT8_", assertionType),
      TRUE ~ assertionID
    ),
    assertionTypeVocabulary = "http://vocab.nerc.ac.uk/collection/P01/current/",
    assertionProtocol = case_when(
      assertionType %in% c("Average temperature", "Maximum Pressure", "Average Salinity", "Average Conductivity", "Average fluoresence") ~ "CTD",
      assertionType %in% c("Solar Azimuth", "Solar elevation", "daylength") ~ "solar azimuth, elevation and day length",
      TRUE ~ ""),
    assertionProtocolID = case_when(
      assertionProtocol == "CTD" ~ "ctd",
      assertionProtocol == "solar azimuth, elevation and day length" ~ "light_conditions",
      TRUE ~ ""),
    assertionUnitVocabulary = "http://vocab.nerc.ac.uk/collection/P06/current/" 
    ) %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/event-assertion.json")
event_assertion %>% filter(assertionID == "BROKE_WEST_RMT_028_temp")

Material Assertion

fish_length_assertion <- assertion %>% 
  filter(!is.na(occurrenceID)) %>%
  rename(materialEntityID = occurrenceID) %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material-assertion.json")

fish_lifestage_assertion <- cleaned_fish %>%
  filter(!is.na(lifeStage)) %>%
  select(materialEntityID, lifeStage) %>%
  rename(assertionValue = lifeStage) %>%
  mutate(assertionID = str_c(materialEntityID, "_lifeStage"),
         assertionType = "life stage",
         assertionTypeIRI = "http://rs.tdwg.org/dwc/terms/lifeStage",
         assertionValueIRI = case_when(
           assertionValue == "Larvae" ~ "https://vocab.nerc.ac.uk/collection/S11/current/S1128/",
           TRUE ~ NA
         ),
         assertionUnit = NA,
         assertionUnitIRI = NA,
         assertionValueNumeric = NA) 

stomach_assertion <- stomach %>%
  select(materialEntityID, DoD, No) %>% 
  pivot_longer(
    cols = c(DoD, No),
    names_to = "assertionType",
    values_to = "assertionValueNumeric",
    values_drop_na = TRUE
  ) %>%
  mutate(
    assertionType = recode(assertionType,
                           DoD = "degree of digestion",
                           No = "individual count"),
    assertionID = case_when(
      assertionType == "degree of digestion" ~ str_c(materialEntityID, "_digestion"),
      assertionType == "individual count" ~ str_c(materialEntityID, "_count")),
    assertionTypeIRI = "",
    assertionValueIRI = "",
    assertionUnit = "",
    assertionUnitIRI = "",
    assertionValue = ""
  )

material_assertion <- rbind(fish_length_assertion, fish_lifestage_assertion, stomach_assertion) %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material-assertion.json")
material_assertion %>% filter(materialEntityID == "AAV3FF_00337")

Media

Material Media

material_media <- media_material %>%
  mutate(mediaSubjectCategory = "specimen photo") %>%
  filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/material-media.json")
head(material_media)

Protocol

Survey Protocol

As Event is also a Survey, I use Survey Protocol table instead of Event Protocol table.

survey_protocol <- survey_w_protocol %>% 
  select(surveyID, samplingProtocol) %>% 
  rename(protocolID = samplingProtocol) %>%
  filter(!is.na(protocolID))
survey_protocol %>% filter(surveyID == "BROKE_WEST_RMT_001_RMT8")

Identifier

Agent Identifier

agent_identifier <- tibble(
  identifier = "https://orcid.org/0000-0003-1336-5554",
  agentID = "https://orcid.org/0000-0003-1336-5554",
  identifierType = "ORCID",
  identifierLanguage = "eng"
) %>% filter_dataframe_by_json("https://rs.gbif.org/sandbox/experimental/data-packages/dwc-dp/0.1/table-schemas/agent-identifier.json")
agent_identifier

Darwin Core Data Package

Clean fish data

Main tables

Agent

Event

Material

Identification

Media

Survey

Protocol

Survey Target & Occurrence

Common tables

Tables not needed: Agent related tables

Assertion

Event Assertion

Material Assertion

Media

Material Media

Protocol

Survey Protocol

Identifier

Agent Identifier