This document contains code written by Erica Krimmel and Diana Soteropoulos to support data analysis for the NANSH research project (2013-2020). To reuse this code, open this file in R and begin by loading packages, setting options, and declaring input data filenames. If you haven’t already installed the packages called below, you will need to do that first.
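For example, any missing packages could be installed with something like the following one-time step (note that the spatial packages rgeos, rgdal, and maptools have since been retired from CRAN and may need to be installed from the CRAN archive):

# Install any packages that are not yet available locally (run once)
install.packages(c("tidyverse", "rgeos", "rgdal", "maptools",
                   "car", "lme4", "lsr", "emmeans", "kableExtra"))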

# Load general packages
library(tidyverse)

# Load packages for working with geospatial data
library(rgeos)
library(rgdal)
library(maptools)

# Load packages for working with model data
library(car)
library(lme4)
library(lsr)
library(emmeans)

# Load packages for formatting in RMarkdown
library(kableExtra)

# Tell R not to use scientific notation
options(scipen = 999)

# The following variables should correlate to input data files in your working
# directory. Edit the filenames here if necessary:
input_specimens <- "Marsico-et-al-2020_specimens_2019-04-17.csv"
input_herbaria <- "Marsico-et-al-2020_herbaria_2019-06-08.csv"
input_species <- "Marsico-et-al-2020_species_2020-07-09.csv"
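The functions below also write their results to an output/ subdirectory of the working directory; if that folder does not already exist, it can be created first, for example:

# Create the output folder used by the `write_csv()` and `ggsave()` calls below
if (!dir.exists("output")) {
  dir.create("output")
}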

Prepare data

This first function prepares an analysis dataset by combining raw data from three compiled data tables (specimens, herbaria, species), filtering out low-quality records, and retaining a single representative record in the case of duplicate specimens. The function also assigns a “uniqueness flag” to each record based on whether its collecting event represents a unique county, unique locality, or unique time. You can use this function to generate different versions of the dataset based on what states you want to include, whether or not you want to include out-of-state herbaria (where we have that information), and what numerical break you want to use to define “small” vs. “large” herbaria (our default is 100,000 specimens).
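For illustration, the function defined below could be called with non-default arguments along these lines (the cutoff and state values here are examples only, not runs performed at this point in the document):

# Example calls with non-default arguments (illustrative only; not run here)
# prepare_analysisSpms(input_specimens, input_herbaria, input_species,
#                      sizeCutoff = 175000)
# prepare_analysisSpms(input_specimens, input_herbaria, input_species,
#                      filterState = "CA",
#                      includeOutOfState = TRUE)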

# Set function and argument defaults
prepare_analysisSpms <- function(input_specimens, input_herbaria, input_species, 
                                     sizeCutoff = 100000,
                                     filterState = "all",
                                     includeOutOfState = FALSE) {
  
  # Record factor variables to use for naming future global variables and files
  recordedVariables <- paste(sizeCutoff, "_",
                    filterState, "_", 
                    includeOutOfState,
                    sep = "")
  filename_unique <- paste("output/Marsico-et-al-2020_uniqueSpms_", 
                    recordedVariables, "_",
                    Sys.Date(), 
                    ".csv", 
                    sep = "")
  filename_analysis <- paste("output/Marsico-et-al-2020_analysisSpms_", 
                    recordedVariables, "_",
                    Sys.Date(), 
                    ".csv", 
                    sep = "")
  filename_summary <- paste("output/Marsico-et-al-2020_analysisSummary_", 
                    recordedVariables, "_",
                    Sys.Date(), 
                    ".csv", 
                    sep = "")
  
  # Read in collated raw data
  input_specimensTbl <- read_csv(input_specimens, na = character())
  input_herbariaTbl <- read_csv(input_herbaria, na = character())
  input_speciesTbl <- read_csv(input_species, na = character())
  
  # Create a new column to assign sizeClass values to herbaria based on the
  # sizeCutoff input variable
  input_herbariaTbl <- input_herbariaTbl %>% 
    mutate(sizeClass = case_when(sizeNumber >= sizeCutoff ~ "large",
                                 sizeNumber < sizeCutoff ~ "small"))
  
  # Calculate the percentage of specimens held by large and small herbaria
  # within the institutions included in `input_herbariaTbl`
  totalSpmsSummary <- input_herbariaTbl %>% 
    group_by(sizeClass) %>% 
    summarize(totalSpecimens = sum(sizeNumber)) %>% 
    mutate(percentSpecimens = round(100 * totalSpecimens / sum(totalSpecimens)))
  
  # Rename `totalSpmsSummary` to include `recordedVariables` from function input
  assign(paste("totalSpmsSummary_", recordedVariables, sep = ""),
         totalSpmsSummary, envir = globalenv())
  
  # Assign variable dataFlag to tell the intermediary dataset `joinedSpms`
  # whether or not to include out of state herbaria
  if(includeOutOfState == FALSE) {
    dataFlag = "COUNTY-reassigned"
  } else if(includeOutOfState == TRUE) {
    dataFlag = c("COUNTY-reassigned", "HERBARIA-out-of-state",
                 "HERBARIA-out-of-state; COUNTY-reassigned")  
  } else {
    stop("Please assign a TRUE/FALSE value to the includeOutOfState argument
         of this function.")
  }
  
  # If the filterState variable is set to "all" update it based on actual state 
  # values in the dataset 
  if(filterState == "all"){
    filterState = levels(factor(input_herbariaTbl$state))
  } else {
    filterState = filterState
  }
  
  # Build dataset `joinedSpms`...
  joinedSpms <- input_specimensTbl %>%
    filter(state %in% filterState) %>% 
    
    # Join relevant columns from `input_speciesTbl` and `input_herbariaTbl`
    left_join(select(input_speciesTbl, index, status), 
              by = c("species_index"="index")) %>%
    left_join(select(input_herbariaTbl, index, herbName, sizeClass), 
              by = c("herbaria_index"="index")) %>% 
    
    # Create columns for countyState and dateRange
    unite(countyState, c(county, state), sep = "_", remove = FALSE) %>% 
    unite(dateRange, c(earlyDate, lateDate), sep = " to ", remove = FALSE) %>% 
    
    # Remove rows that have already been marked as low quality by filtering out
    # those with data flags other than those allowed by the variable dataFlag
    filter(DATA_FLAG == "" | DATA_FLAG %in% dataFlag) %>% 
    
    # Group duplicate specimens and create a temporary column groupIndex
    unite(duplicateGroup, 
          c(species_index, countyState, dateRange, collectorStandard), 
          sep = "|", remove = FALSE) %>% 
    mutate(groupIndex = group_indices(.,duplicateGroup)) %>% 
    select(-duplicateGroup)
  
  # Build dataset `duplicateSpms` to record size classes of the herbaria holding
  # each set of duplicates
  duplicateSpms <- joinedSpms %>% 
    group_by(groupIndex) %>%
    arrange(desc(sizeClass)) %>% 
    summarise(duplicateStatus = paste(sizeClass, collapse = ","))
  
  # Add results from `duplicateSpms` back into the full dataset and deal with
  # duplicate rows: retain one representative from each duplicate set here;
  # sets shared between both large and small collections are discarded entirely
  # at the uniqueness-flagging step further below.
  uniqueSpms <- joinedSpms %>% 
    left_join(duplicateSpms, by = "groupIndex") %>% 
    group_by(groupIndex) %>% 
    # Make sure to retain a row with coordinate data, if possible
    arrange(latitude) %>% 
    filter(row_number()==1)
  
  # Rename `uniqueSpms` to include `recordedVariables` from function input
  assign(paste("uniqueSpms_", recordedVariables, sep = ""),
         uniqueSpms, envir = globalenv())
  
  # Output result to CSV file
  write_csv(uniqueSpms, filename_unique)
  
  # Report on how large and small collections share duplicate specimens
  duplicateSpmsSummary <- uniqueSpms %>% 
    mutate(duplicateType = case_when(duplicateStatus == "large" ~ 
                              "unduplicated",
                            duplicateStatus == "small" ~ 
                              "unduplicated",  
                            str_detect(duplicateStatus, "small,large") ~
                              "duplicated by large and small herbaria",
                            !str_detect(duplicateStatus, "large") ~
                              "duplicated only by small herbaria",
                            !str_detect(duplicateStatus, "small") ~
                              "duplicated only by large herbaria")) %>% 
    group_by(sizeClass, duplicateType) %>% 
    tally() %>% 
    pivot_wider(names_from = sizeClass, values_from = n) %>% 
    rename(largeCount = large, smallCount = small) %>% 
    mutate(largePercent = round(largeCount/sum(largeCount, na.rm = TRUE)*100, 2)) %>% 
    mutate(smallPercent = round(smallCount/sum(smallCount, na.rm = TRUE)*100, 2))
  
  # Rename `duplicateSpmsSummary` to include `recordedVariables` from function
  # input
  assign(paste("duplicateSpmsSummary_", recordedVariables, sep = ""),
         duplicateSpmsSummary, envir = globalenv())
  
  # Determine what the expected proportion of specimens held by large vs. small
  # herbaria is
  expectedProportion <- uniqueSpms %>% 
    group_by(sizeClass) %>% 
    tally() %>% 
    mutate(freq = n/sum(n)) %>% 
    pull(freq)
  
  # Pull out the expected proportion for just small herbaria (the second
  # element, since sizeClass levels sort as "large" then "small")
  expectedProportionSmall <- expectedProportion[2]
  
  # Rename `expectedProportion` to include `recordedVariables` from function
  # input
  assign(paste("expectedProportion_", recordedVariables, sep = ""),
         expectedProportion, envir = globalenv())
  
  # Remove rows without coordinates from uniqueSpms to begin the process of
  # evaluating which specimens were collected within 1km of each other (i.e. at
  # non-unique localities)
  geo_uniqueSpms <- uniqueSpms %>% filter(!is.na(longitude))
  
  # Create spatial data frame to use in geographic analysis below
  geo_spdf <- SpatialPointsDataFrame(coords = cbind(geo_uniqueSpms$longitude,
                                                    geo_uniqueSpms$latitude),
                                     data = geo_uniqueSpms, 
                                     proj4string = 
                                       CRS("+proj=longlat +datum=WGS84"))
  
  # Project data in Conus Albers (NAD83) so that we can calculate 1km buffers
  geo_project <- spTransform(geo_spdf, CRS("+init=epsg:5070"))
  
  # Create new spatial data frame with 1km polygons buffering each point
  geo_buffer <- gBuffer(geo_project, byid=TRUE, width=1000)
  
  # Join attributes by location to determine which buffers intersect each other,
  # then return a count of how many points lie within any given 1km buffer
  geo_intersect <- over(geo_buffer, geo_buffer[, "index"], returnList = TRUE)
  geo_count <- sapply(geo_intersect, nrow)
  geo_bound <- spCbind(geo_project, geo_count)
  geo_data <- geo_bound@data %>% 
    mutate(nearbySpms = geo_count - 1) %>% 
    select(-geo_count)
  
  # Add geospatial analysis results `geo_data` back into full `uniqueSpms` data
  analysisSpms <- uniqueSpms %>% 
    left_join(select(geo_data, index, nearbySpms), by = "index") 
  
  # Flag unique counties
  groupCounty <- uniqueSpms %>%
    group_by(species_index, countyState) %>% tally() %>%
    mutate(unique_county = case_when(n == 1 ~ "unique_county")) %>%
    ungroup()
  
  # Flag unique localities and unique times
  analysisSpms <- analysisSpms %>%
    filter(!str_detect(duplicateStatus, "small,large" )) %>%
    left_join(groupCounty, by = c("species_index", "countyState")) %>% 
    mutate(unique_locality = case_when(nearbySpms == 0 ~ "unique_locality")) %>%
    unite("uniqueness", c("unique_county", "unique_locality"), sep = "/") %>%
    mutate(uniqueness = sub("unique_county/unique_locality", "unique_county", 
                            uniqueness)) %>%
    mutate(uniqueness = sub("unique_county/NA", "unique_county", 
                            uniqueness)) %>%
    mutate(uniqueness = sub("NA/unique_locality", "unique_locality",
                            uniqueness)) %>%
    mutate(uniqueness = sub("NA/NA", "unique_time", 
                            uniqueness)) %>% 
    unite("check", c("uniqueness", "dateRange"), sep = "/", remove = FALSE) %>% 
    filter(check != "unique_time/\"1800\" to \"2015\"") %>% 
    select(-n, -check) %>% 
    ungroup
  
  # Rename `analysisSpms` to include `recordedVariables` from function input
  assign(paste("analysisSpms_", recordedVariables, sep = ""),
         analysisSpms, envir = globalenv())
    
  # Output result to CSV file
  write_csv(analysisSpms, filename_analysis)
  
  # Group `analysisSpms` data for use in figures
  analysisSummary <- analysisSpms %>% 
    group_by(uniqueness, state, status, sizeClass) %>% 
    tally() %>% 
    group_by(uniqueness, state, status) %>% 
    mutate(frequency = round((n/sum(n)),2)) %>% 
    rename(count = n)

  # Rename `analysisSummary` to include `recordedVariables` from function input
  assign(paste("analysisSummary_", recordedVariables, sep = ""),
         analysisSummary, envir = globalenv())
    
  # Output result to CSV file; note that rows may be missing where no unique
  # specimens were found and that this is resolved in the code below for
  # purposes of graphing results
  write_csv(analysisSummary, filename_summary)
}
# Run function
prepare_analysisSpms(input_specimens, input_herbaria, input_species)

We get several resulting objects by running the function above with our default values (i.e. a size cutoff of 100000 specimens to differentiate between small and large herbaria, and including data from all states but no out-of-state herbaria). The following objects now exist in our R environment…
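For example, the objects created by this default run can be listed by their shared name suffix:

# List objects in the global environment created by the default run
ls(pattern = "_100000_all_FALSE")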

totalSpmsSummary_100000_all_FALSE is a summary of how many total specimens are held by the herbaria participating in this project:

sizeClass totalSpecimens percentSpecimens
large 13093200 86
small 2073533 14

duplicateSpmsSummary_100000_all_FALSE is a summary of how many specimens in the uniqueSpms dataset represent duplicates in the original data (i.e. the input_specimens):

duplicateType largeCount smallCount largePercent smallPercent
duplicated by large and small herbaria 289 130 2.55 2.60
duplicated only by large herbaria 1635 NA 14.42 NA
unduplicated 9415 4456 83.03 88.96
duplicated only by small herbaria NA 423 NA 8.44
# Manually add rows that do not exist on `analysisSummary_100000_all_FALSE`
# because there was no data; this will let us effectively use this for graphing
analysisSummary_100000_all_FALSE <- analysisSummary_100000_all_FALSE %>% 
  ungroup %>% 
  add_row(uniqueness="unique_county",
          state="AR",
          status="S1",
          sizeClass="large",
          count=0,
          frequency=0) %>% 
  add_row(uniqueness="unique_county",
          state="TN",
          status="S1",
          sizeClass="small",
          count=0,
          frequency=0) %>% 
  add_row(uniqueness="unique_locality",
          state="AR",
          status="S1",
          sizeClass="large",
          count=0,
          frequency=0) %>% 
  add_row(uniqueness="unique_locality",
          state="WV",
          status="S1",
          sizeClass="small",
          count=0,
          frequency=0) %>% 
  add_row(uniqueness="unique_time",
          state="AR",
          status="S1",
          sizeClass="large",
          count=0,
          frequency=0) %>% 
  arrange(uniqueness, state, status, sizeClass) %>% 
  mutate(uniqueness = str_replace(uniqueness, "_", " "))

# Add factors for ordering of `analysisSummary_100000_all_FALSE` 
# categories in graph below
analysisSummary_100000_all_FALSE$status2 <- factor(
  analysisSummary_100000_all_FALSE$status, 
  levels = c("S1", "S2", "native", "introduced"))
  
# Graph unique specimen records in each of our four categories by species
# status and herbarium size class
analysisSummary_100000_all_FALSE %>% 
  ggplot(aes(x = status2, y = count, label = count)) +
  geom_bar(aes(fill = sizeClass),
           stat="identity", width = 0.5, position = "dodge") +
  scale_fill_manual(values=c("#003f5c", "#ffa600")) +
  scale_y_continuous() +
  facet_grid(uniqueness ~ state, scales = "free_y") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(color = "Size class",
       fill = "Herbarium size",
       x = "Species status",
       y = "Specimen records")

# Save graph to file
ggsave(paste("output/Marsico-et-al-2020_summaryFigure_100000_all_FALSE_", 
             Sys.Date(), 
             ".png", 
             sep = ""),
       width = 7.25,
       height = 5)

analysisSummary_100000_all_FALSE is a summarized version of the analysisSpms data to use for analysis and graphing:

uniqueness state status sizeClass count frequency status2
unique county AR introduced large 55 0.65 introduced
unique county AR introduced small 30 0.35 introduced
unique county AR native large 77 0.51 native
unique county AR native small 75 0.49 native
unique county AR S1 large 0 0.00 S1
unique county AR S1 small 6 1.00 S1
unique county AR S2 large 4 0.19 S2
unique county AR S2 small 17 0.81 S2
unique county CA introduced large 28 0.76 introduced
unique county CA introduced small 9 0.24 introduced
unique county CA native large 14 0.67 native
unique county CA native small 7 0.33 native
unique county CA S1 large 7 0.64 S1
unique county CA S1 small 4 0.36 S1
unique county CA S2 large 8 0.47 S2
unique county CA S2 small 9 0.53 S2
unique county CO introduced large 34 0.68 introduced
unique county CO introduced small 16 0.32 introduced
unique county CO native large 47 0.76 native
unique county CO native small 15 0.24 native
unique county CO S1 large 1 0.50 S1
unique county CO S1 small 1 0.50 S1
unique county CO S2 large 4 0.40 S2
unique county CO S2 small 6 0.60 S2
unique county FL introduced large 57 0.93 introduced
unique county FL introduced small 4 0.07 introduced
unique county FL native large 47 0.90 native
unique county FL native small 5 0.10 native
unique county FL S1 large 11 0.92 S1
unique county FL S1 small 1 0.08 S1
unique county FL S2 large 17 0.74 S2
unique county FL S2 small 6 0.26 S2
unique county GA introduced large 145 0.58 introduced
unique county GA introduced small 105 0.42 introduced
unique county GA native large 110 0.69 native
unique county GA native small 50 0.31 native
unique county GA S1 large 6 0.67 S1
unique county GA S1 small 3 0.33 S1
unique county GA S2 large 19 0.56 S2
unique county GA S2 small 15 0.44 S2
unique county MI introduced large 87 0.72 introduced
unique county MI introduced small 33 0.28 introduced
unique county MI native large 107 0.76 native
unique county MI native small 33 0.24 native
unique county MI S1 large 11 0.92 S1
unique county MI S1 small 1 0.08 S1
unique county MI S2 large 33 0.77 S2
unique county MI S2 small 10 0.23 S2
unique county TN introduced large 130 0.73 introduced
unique county TN introduced small 47 0.27 introduced
unique county TN native large 160 0.83 native
unique county TN native small 33 0.17 native
unique county TN S1 large 8 1.00 S1
unique county TN S1 small 0 0.00 S1
unique county TN S2 large 18 0.78 S2
unique county TN S2 small 5 0.22 S2
unique county WV introduced large 176 0.99 introduced
unique county WV introduced small 2 0.01 introduced
unique county WV native large 67 0.97 native
unique county WV native small 2 0.03 native
unique county WV S1 large 18 0.95 S1
unique county WV S1 small 1 0.05 S1
unique county WV S2 large 38 0.88 S2
unique county WV S2 small 5 0.12 S2
unique locality AR introduced large 13 0.48 introduced
unique locality AR introduced small 14 0.52 introduced
unique locality AR native large 126 0.55 native
unique locality AR native small 103 0.45 native
unique locality AR S1 large 0 0.00 S1
unique locality AR S1 small 1 1.00 S1
unique locality AR S2 large 2 0.13 S2
unique locality AR S2 small 13 0.87 S2
unique locality CA introduced large 355 0.76 introduced
unique locality CA introduced small 113 0.24 introduced
unique locality CA native large 499 0.86 native
unique locality CA native small 84 0.14 native
unique locality CA S1 large 111 0.97 S1
unique locality CA S1 small 4 0.03 S1
unique locality CA S2 large 42 0.95 S2
unique locality CA S2 small 2 0.05 S2
unique locality CO introduced large 83 0.64 introduced
unique locality CO introduced small 47 0.36 introduced
unique locality CO native large 258 0.76 native
unique locality CO native small 83 0.24 native
unique locality CO S1 large 30 0.86 S1
unique locality CO S1 small 5 0.14 S1
unique locality CO S2 large 48 0.64 S2
unique locality CO S2 small 27 0.36 S2
unique locality FL introduced large 167 0.91 introduced
unique locality FL introduced small 16 0.09 introduced
unique locality FL native large 221 0.90 native
unique locality FL native small 25 0.10 native
unique locality FL S1 large 33 0.89 S1
unique locality FL S1 small 4 0.11 S1
unique locality FL S2 large 27 0.69 S2
unique locality FL S2 small 12 0.31 S2
unique locality GA introduced large 47 0.35 introduced
unique locality GA introduced small 88 0.65 introduced
unique locality GA native large 114 0.80 native
unique locality GA native small 28 0.20 native
unique locality GA S1 large 2 0.67 S1
unique locality GA S1 small 1 0.33 S1
unique locality GA S2 large 7 0.47 S2
unique locality GA S2 small 8 0.53 S2
unique locality MI introduced large 119 0.60 introduced
unique locality MI introduced small 78 0.40 introduced
unique locality MI native large 219 0.66 native
unique locality MI native small 114 0.34 native
unique locality MI S1 large 2 0.67 S1
unique locality MI S1 small 1 0.33 S1
unique locality MI S2 large 20 0.74 S2
unique locality MI S2 small 7 0.26 S2
unique locality TN introduced large 86 0.58 introduced
unique locality TN introduced small 63 0.42 introduced
unique locality TN native large 191 0.72 native
unique locality TN native small 73 0.28 native
unique locality TN S1 large 5 0.71 S1
unique locality TN S1 small 2 0.29 S1
unique locality TN S2 large 20 0.83 S2
unique locality TN S2 small 4 0.17 S2
unique locality WV introduced large 97 0.98 introduced
unique locality WV introduced small 2 0.02 introduced
unique locality WV native large 20 0.77 native
unique locality WV native small 6 0.23 native
unique locality WV S1 large 7 1.00 S1
unique locality WV S1 small 0 0.00 S1
unique locality WV S2 large 36 0.82 S2
unique locality WV S2 small 8 0.18 S2
unique time AR introduced large 23 0.39 introduced
unique time AR introduced small 36 0.61 introduced
unique time AR native large 145 0.56 native
unique time AR native small 112 0.44 native
unique time AR S1 large 0 0.00 S1
unique time AR S1 small 3 1.00 S1
unique time AR S2 large 10 0.26 S2
unique time AR S2 small 29 0.74 S2
unique time CA introduced large 499 0.75 introduced
unique time CA introduced small 163 0.25 introduced
unique time CA native large 664 0.86 native
unique time CA native small 105 0.14 native
unique time CA S1 large 188 0.88 S1
unique time CA S1 small 26 0.12 S1
unique time CA S2 large 160 0.89 S2
unique time CA S2 small 20 0.11 S2
unique time CO introduced large 158 0.57 introduced
unique time CO introduced small 117 0.43 introduced
unique time CO native large 319 0.54 native
unique time CO native small 274 0.46 native
unique time CO S1 large 46 0.61 S1
unique time CO S1 small 30 0.39 S1
unique time CO S2 large 69 0.56 S2
unique time CO S2 small 54 0.44 S2
unique time FL introduced large 423 0.82 introduced
unique time FL introduced small 96 0.18 introduced
unique time FL native large 373 0.80 native
unique time FL native small 94 0.20 native
unique time FL S1 large 129 0.74 S1
unique time FL S1 small 46 0.26 S1
unique time FL S2 large 202 0.85 S2
unique time FL S2 small 35 0.15 S2
unique time GA introduced large 124 0.34 introduced
unique time GA introduced small 239 0.66 introduced
unique time GA native large 151 0.65 native
unique time GA native small 80 0.35 native
unique time GA S1 large 9 0.64 S1
unique time GA S1 small 5 0.36 S1
unique time GA S2 large 18 0.82 S2
unique time GA S2 small 4 0.18 S2
unique time MI introduced large 408 0.61 introduced
unique time MI introduced small 262 0.39 introduced
unique time MI native large 872 0.68 native
unique time MI native small 418 0.32 native
unique time MI S1 large 41 0.71 S1
unique time MI S1 small 17 0.29 S1
unique time MI S2 large 99 0.80 S2
unique time MI S2 small 24 0.20 S2
unique time TN introduced large 254 0.42 introduced
unique time TN introduced small 347 0.58 introduced
unique time TN native large 505 0.59 native
unique time TN native small 344 0.41 native
unique time TN S1 large 42 0.40 S1
unique time TN S1 small 64 0.60 S1
unique time TN S2 large 65 0.52 S2
unique time TN S2 small 60 0.48 S2
unique time WV introduced large 214 0.74 introduced
unique time WV introduced small 74 0.26 introduced
unique time WV native large 71 0.65 native
unique time WV native small 39 0.35 native
unique time WV S1 large 35 0.92 S1
unique time WV S1 small 3 0.08 S1
unique time WV S2 large 80 0.71 S2
unique time WV S2 small 32 0.29 S2

We also have uniqueSpms_100000_all_FALSE, which is a comprehensive dataset of the unique specimen records included in this project, and analysisSpms_100000_all_FALSE, which is the primary dataset generated by this function for use in the rest of the project’s analysis. The function writes CSV versions of the uniqueSpms, analysisSpms, and analysisSummary objects to the output folder in your working directory, and the code above also generates and saves a figure for the analysisSummary data, as shown here:

We can use the analysisSpms data to visualize different facets of the project, e.g. this figure illustrating how many specimens were contributed by large versus small herbaria in each state.
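A sketch of one way such a figure could be drawn from the default analysisSpms object (this is not the project’s original figure code; the aesthetics are assumptions):

# Sketch: count analysis specimen records by state and herbarium size class
analysisSpms_100000_all_FALSE %>% 
  ggplot(aes(x = state, fill = sizeClass)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("#003f5c", "#ffa600")) +
  theme_bw() +
  labs(fill = "Herbarium size",
       x = "State",
       y = "Specimen records")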

Model data

In modeling the data, we wanted to ask what factors predict the uniqueness of a specimen at a given uniqueness scale (county, locality, or time). We analyze the data using multiple logistic regression because we have a binary response variable (“Is the specimen unique? Yes/No”) and nominal predictor variables: size class (small, large), species status (S1, S2, native, introduced), and state (AR, CA, CO, FL, GA, MI, TN, WV). To model this, we use a generalized linear mixed-effects model (GLMM) with state as a random effect.

# Set function to model the analysis dataset; it begins by dummy coding our
# response variable (uniqueness) into three binary columns so we can analyze
# each level individually: unique_county, unique_locality, unique_time
model_analysisSpms <- function(analysisSpms) {
  
  # Extract input variables to use for naming future global variables and files
  recordedVariables <- deparse(substitute(analysisSpms)) %>% 
    str_remove("analysisSpms_")
  filename_results <- paste("output/Marsico-et-al-2020_modelResults_", 
                    recordedVariables, "_",
                    Sys.Date(), 
                    ".csv", 
                    sep = "")
  
  # Set generic function to convert a level of a column into a new binary column
  convert_toLogicals <- function(data, column, value) {
    data %>% mutate(!!value := as.numeric(case_when(str_detect(column, value)==
                                                      TRUE ~ "1", 
                                                    str_detect(column, value)==
                                                      FALSE ~ "0")))
  }
  
  # Get levels from the response variable column
  levels_uniqueness <- levels(factor(analysisSpms$uniqueness))
  
  # Use the `convert_toLogicals` function to generate new binary columns based
  # on the nominal values of the response column
  analysisSpms_binary <- analysisSpms %>% 
    select(index, uniqueness, sizeClass, status, state)
  
  for (i in seq_along(levels_uniqueness)) {
    analysisSpms_binary <- analysisSpms_binary %>% 
      convert_toLogicals(.$uniqueness, levels_uniqueness[i])
  }

  # Return this new binary version of the dataset as our model data
  modelData <- analysisSpms_binary %>% 
    select(index,
           unique_county,
           unique_locality,
           unique_time,
           everything(),
           -uniqueness)
  
  # Save `modelData` in the global environment
  assign(paste("modelData_", recordedVariables, sep = ""),
         modelData, envir = globalenv())
  
  # Include state as a random effect when modeling data from all states; state
  # cannot be a random effect when the data are filtered to specific states
  if(grepl("all", recordedVariables)) {
    
    #Generate models for county-level uniqueness
    county_m1 <- glmer(data = modelData,
                       unique_county ~ sizeClass + (1|state),
                       family = binomial(link="logit"))
    county_m2 <- glmer(data = modelData,
                       unique_county ~ status + (1|state), 
                       family = binomial(link="logit"))
    county_mGLOBAL <- glmer(data = modelData,
                            unique_county ~ sizeClass + status + (1|state), 
                            family = binomial(link="logit"))
    county_mNULL <- glmer(data = modelData,
                          unique_county ~ 1 + (1|state), 
                          family = binomial(link="logit"))
    
    # Generate models for locality-level uniqueness
    locality_m1 <- glmer(data = modelData,
                         unique_locality ~ sizeClass + (1|state),
                         family = binomial(link="logit"))
    locality_m2 <- glmer(data = modelData,
                         unique_locality ~ status + (1|state), 
                         family = binomial(link="logit"))
    locality_mGLOBAL <- glmer(data = modelData,
                              unique_locality ~ sizeClass + status + (1|state), 
                              family = binomial(link="logit"))
    locality_mNULL <- glmer(data = modelData,
                            unique_locality ~ 1 + (1|state), 
                            family = binomial(link="logit"))
    
    # Generate models for time-level uniqueness
    time_m1 <- glmer(data = modelData,
                     unique_time ~ sizeClass + (1|state),
                     family = binomial(link="logit"))
    time_m2 <- glmer(data = modelData,
                     unique_time ~ status + (1|state), 
                     family = binomial(link="logit"))
    time_mGLOBAL <- glmer(data = modelData,
                          unique_time ~ sizeClass + status + (1|state), 
                          family = binomial(link="logit"))
    time_mNULL <- glmer(data = modelData,
                        unique_time ~ 1 + (1|state), 
                        family = binomial(link="logit"))
  } else if(!grepl("all", recordedVariables)) {
     
    # Generate models for county-level uniqueness; these use glm() rather than
    # glmer() because state cannot be included as a random effect when the
    # data have been filtered to specific states
    county_m1 <- glm(data = modelData,
                     unique_county ~ sizeClass,
                     family = binomial(link="logit"))
    county_m2 <- glm(data = modelData,
                     unique_county ~ status, 
                     family = binomial(link="logit"))
    county_mGLOBAL <- glm(data = modelData,
                          unique_county ~ sizeClass + status, 
                          family = binomial(link="logit"))
    county_mNULL <- glm(data = modelData,
                        unique_county ~ 1, 
                        family = binomial(link="logit"))
    
    # Generate models for locality-level uniqueness
    locality_m1 <- glm(data = modelData,
                       unique_locality ~ sizeClass,
                       family = binomial(link="logit"))
    locality_m2 <- glm(data = modelData,
                       unique_locality ~ status, 
                       family = binomial(link="logit"))
    locality_mGLOBAL <- glm(data = modelData,
                            unique_locality ~ sizeClass + status, 
                            family = binomial(link="logit"))
    locality_mNULL <- glm(data = modelData,
                          unique_locality ~ 1, 
                          family = binomial(link="logit"))
    
    # Generate models for time-level uniqueness
    time_m1 <- glm(data = modelData,
                   unique_time ~ sizeClass,
                   family = binomial(link="logit"))
    time_m2 <- glm(data = modelData,
                   unique_time ~ status, 
                   family = binomial(link="logit"))
    time_mGLOBAL <- glm(data = modelData,
                        unique_time ~ sizeClass + status, 
                        family = binomial(link="logit"))
    time_mNULL <- glm(data = modelData,
                      unique_time ~ 1, 
                      family = binomial(link="logit"))
  } else {
    stop("There is a problem with the state/s being passed into this function
         via the analysisSpms dataframe name.")
  }
  
  # Pull results out of model objects so that we can compare more easily
  modelResults <- tribble(
    ~responseVariable, ~modelName, ~modelFormula, ~AIC,
    "unique_county", "m1", deparse(formula(county_m1)), AIC(county_m1),
    "unique_county", "m2", deparse(formula(county_m2)), AIC(county_m2),
    "unique_county", "mGLOBAL", deparse(formula(county_mGLOBAL)), AIC(county_mGLOBAL),
    "unique_county", "mNULL", deparse(formula(county_mNULL)), AIC(county_mNULL),
    "unique_locality", "m1", deparse(formula(locality_m1)), AIC(locality_m1),
    "unique_locality", "m2", deparse(formula(locality_m2)), AIC(locality_m2),
    "unique_locality", "mGLOBAL", deparse(formula(locality_mGLOBAL)), AIC(locality_mGLOBAL),
    "unique_locality", "mNULL", deparse(formula(locality_mNULL)),AIC(locality_mNULL),
    "unique_time", "m1", deparse(formula(time_m1)), AIC(time_m1),
    "unique_time", "m2", deparse(formula(time_m2)),AIC(time_m2),
    "unique_time", "mGLOBAL", deparse(formula(time_mGLOBAL)), AIC(time_mGLOBAL),
    "unique_time", "mNULL", deparse(formula(time_mNULL)), AIC(time_mNULL))
  
  # Rename global models for each uniqueness level to include
  # `recordedVariables` from function input and save to global environment
  # for testing fit
  assign(paste("county_mGLOBAL_", recordedVariables, sep = ""),
         county_mGLOBAL, envir = globalenv())
  assign(paste("locality_mGLOBAL_", recordedVariables, sep = ""),
         locality_mGLOBAL, envir = globalenv())
  assign(paste("time_mGLOBAL_", recordedVariables, sep = ""),
         time_mGLOBAL, envir = globalenv())
  
  # Calculate delta AIC and AIC weights for each uniqueness level 
  modelResults <- modelResults %>% 
    group_by(responseVariable) %>% 
    mutate(dAIC = round(AIC - min(AIC), 1)) %>% 
    mutate(AIC = round(AIC, 0)) %>% 
    mutate(weight = round(exp(- 0.5 * dAIC)/sum(exp(- 0.5 * dAIC)), 2))
  
  # Rename `modelResults` to include `recordedVariables` from function input
  assign(paste("modelResults_", recordedVariables, sep = ""),
         modelResults, envir = globalenv())
  
  # Output result to CSV file
  write_csv(modelResults, filename_results)
}
# Run function for default analysis variables
model_analysisSpms(analysisSpms_100000_all_FALSE)

Our new reformatted modelData looks like this:

index unique_county unique_locality unique_time sizeClass status state
823 0 0 1 large introduced FL
1994 0 0 1 large introduced FL
902 0 0 1 large S2 FL
1205 0 0 1 large S2 FL
1962 0 0 1 large introduced FL
2088 0 0 1 large introduced FL

We will use modelData to generate our generalized linear mixed-effects models using the lme4::glmer function. Note that we are analyzing each of the three scales of uniqueness (county, locality, time) separately.

Each of these models is a complex object, for example, here is the summary of the global model for county-scale uniqueness (county_mGLOBAL) using our default variables (i.e. a size cutoff of 100000 specimens to differentiate between small and large herbaria, and including data from all states but no out-of-state herbaria):
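The chunk that printed this summary is not shown above; presumably it was a call along the lines of:

# Print the summary of the county-scale global model from the default run
summary(county_mGLOBAL_100000_all_FALSE)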

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: unique_county ~ sizeClass + status + (1 | state)
##    Data: modelData
## 
##      AIC      BIC   logLik deviance df.resid 
##  10957.7  11003.7  -5472.8  10945.7    15785 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.8749 -0.4242 -0.2774 -0.1645  8.5505 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  state  (Intercept) 0.9883   0.9941  
## Number of obs: 15791, groups:  state, 8
## 
## Fixed effects:
##                Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)    -1.46498    0.34972  -4.189 0.000028012366874403 ***
## sizeClasssmall -0.61273    0.05742 -10.670 < 0.0000000000000002 ***
## statusnative   -0.45058    0.05565  -8.096 0.000000000000000566 ***
## statusS1       -0.38045    0.12823  -2.967              0.00301 ** 
## statusS2       -0.08012    0.08700  -0.921              0.35712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) szClss sttsnt sttsS1
## sizClsssmll -0.048                     
## statusnativ -0.081  0.114              
## statusS1    -0.034  0.020  0.196       
## statusS2    -0.048  0.016  0.293  0.136

We can summarize the relevant results from each model so that they are easier to compare:

responseVariable modelName modelFormula AIC dAIC weight
unique_county m1 unique_county ~ sizeClass + (1 | state) 11022 64.4 0
unique_county m2 unique_county ~ status + (1 | state) 11076 118.3 0
unique_county mGLOBAL unique_county ~ sizeClass + status + (1 | state) 10958 0.0 1
unique_county mNULL unique_county ~ 1 + (1 | state) 11123 165.5 0
unique_locality m1 unique_locality ~ sizeClass + (1 | state) 17422 90.0 0
unique_locality m2 unique_locality ~ status + (1 | state) 17365 32.7 0
unique_locality mGLOBAL unique_locality ~ sizeClass + status + (1 | state) 17332 0.0 1
unique_locality mNULL unique_locality ~ 1 + (1 | state) 17459 126.9 0
unique_time m1 unique_time ~ sizeClass + (1 | state) 20460 52.8 0
unique_time m2 unique_time ~ status + (1 | state) 20566 158.5 0
unique_time mGLOBAL unique_time ~ sizeClass + status + (1 | state) 20408 0.0 1
unique_time mNULL unique_time ~ 1 + (1 | state) 20616 208.8 0

Now that we’ve built our models we need to verify that they are appropriate for us to use. First, we should check the global models to make sure that none of our predictor variables are correlated. We can evaluate this by looking at the variance inflation factor for each uniqueness scale.
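The chunk that computed these values is not shown here; presumably it applied car::vif() to the global model objects saved by model_analysisSpms, along the lines of the sketch below (assuming car::vif() accepts these glmerMod objects and reports generalized VIFs for the categorical predictors):

# Check (generalized) variance inflation factors for each global model
vif(county_mGLOBAL_100000_all_FALSE)
vif(locality_mGLOBAL_100000_all_FALSE)
vif(time_mGLOBAL_100000_all_FALSE)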

The GVIF values produced by this check are all low (<2.5), confirming that our predictor variables are not strongly correlated in any of the models.

We will also look at Cramér’s V coefficients between the predictor variables to further check that there are no substantive associations among them.
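Again, the original chunk is not shown; a sketch of this check using lsr::cramersV() on the dummy-coded model data might look like the following (the choice of predictor pairs is an assumption):

# Pairwise Cramer's V between the nominal predictor variables
cramersV(modelData_100000_all_FALSE$sizeClass, modelData_100000_all_FALSE$status)
cramersV(modelData_100000_all_FALSE$sizeClass, modelData_100000_all_FALSE$state)
cramersV(modelData_100000_all_FALSE$status, modelData_100000_all_FALSE$state)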

The resulting values are all below 0.3, indicating at most a weak association between variables, and values below 0.1 indicate little to no association. These values meet our threshold.

Graph results

We can also visualize our models to help us interpret these results. In the figures below, we are comparing our observed (from the data) and predicted (from the model) probabilities that a specimen represents unique information at a given biogeographic scale.

# Set function
graph_analysisResults <- function(analysisSummary,
                                  county_mGLOBAL,
                                  locality_mGLOBAL,
                                  time_mGLOBAL) {
  
  # Capture variables from the name of data object passed into function
  recordedVariables <- deparse(substitute(analysisSummary)) %>% 
    str_replace("analysisSummary_", "")

  # Calculate the count and proportion of specimens from small vs. large
  # herbaria, grouped by uniqueness and status but with all states combined
  observedFigure <- analysisSummary %>% 
    ungroup() %>% 
    select(-frequency) %>% 
    mutate(uniqueness = toupper(str_replace(uniqueness, "_", " "))) %>% 
    group_by(uniqueness, status, sizeClass) %>% 
    summarize(groupSum = sum(count)) %>% 
    group_by(status, sizeClass) %>% 
    mutate(prop = round(groupSum/sum(groupSum), 2))
    
  # Add factors for ordering of `observedFigure` categories in graph below
  observedFigure$status2 <- factor(observedFigure$status, 
                                   levels = c("S1", 
                                              "S2", 
                                              "native", 
                                              "introduced"))
  # Graph `observedFigure` for Panel A of final figure
  ggplot(observedFigure, aes(x = status2,
                             y = groupSum,
                             label = groupSum,
                             fill= sizeClass)) +
    geom_bar(aes(),
             stat="identity", 
             position = position_dodge()) +
    scale_fill_manual(values=c("#003f5c", "#ffa600")) +
    scale_y_continuous() +
    facet_wrap(~ uniqueness) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    labs(color = "Size class",
         fill = "Herbarium size",
         x = "Species status",
         y = "Observed specimen records")
  
  # Save graph to PNG file
  ggsave(paste("output/Marsico-et-al-2020_observedFigureCount_", 
               recordedVariables, "_",
               Sys.Date(), 
               ".png", 
               sep = ""),
         width = 8,
         height = 4)
  
  # Graph `observedFigure` for Panel B of final figure
  ggplot(observedFigure, aes(x = status2,
                             y = prop,
                             label = prop,
                             fill= sizeClass)) +
    geom_bar(aes(),
             stat="identity", 
             position = position_dodge()) +
    scale_fill_manual(values=c("#003f5c", "#ffa600")) +
    scale_y_continuous() +
    facet_wrap(~ uniqueness) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    labs(color = "Size class",
         fill = "Herbarium size",
         x = "Species status",
         y = "Probability observed in data")
  
  # Save graph to PNG file
  ggsave(paste("output/Marsico-et-al-2020_observedFigureProp_", 
               recordedVariables, "_",
               Sys.Date(), 
               ".png", 
               sep = ""),
         width = 8,
         height = 4)
  
  # Calculate estimated marginal means for global models; use "type = response"
  # to back transform our results from the logit link function we used when
  # creating the models
  predictedFigure_county <- summary(emmeans(county_mGLOBAL, 
                                            c("sizeClass", "status"), 
                                            type = "response")) %>% 
    mutate(uniqueness = "UNIQUE COUNTY")
  
  predictedFigure_locality <- summary(emmeans(locality_mGLOBAL, 
                                              c("sizeClass", "status"), 
                                              type = "response")) %>% 
    mutate(uniqueness = "UNIQUE LOCALITY")
  
  predictedFigure_time <- summary(emmeans(time_mGLOBAL, 
                                          c("sizeClass", "status"), 
                                          type = "response")) %>% 
    mutate(uniqueness = "UNIQUE TIME")
  
  # Combine the emmeans results from all three uniqueness levels into a single
  # data frame for graphing
  predictedFigure <- rbind(predictedFigure_county,
                           predictedFigure_locality,
                           predictedFigure_time)
  
  
  # Add factors for ordering of `predictedFigure` categories in graph below
  predictedFigure$status2 <- factor(predictedFigure$status, 
                                   levels = c("S1", 
                                              "S2", 
                                              "native", 
                                              "introduced"))
  
  # Graph `predictedFigure` for Panel C of final figure
  ggplot(predictedFigure, aes(x = status2,
                              y = prob, 
                              label = prob,
                              fill= sizeClass)) +
    geom_bar(aes(),
             stat="identity", 
             position = position_dodge()) +
    geom_errorbar(aes(ymin = prob - SE,
                      ymax = prob + SE),
                  color = "gray",
                  width = 0.2,
                  position = position_dodge(0.9)) +
    scale_fill_manual(values=c("#003f5c", "#ffa600")) +
    scale_y_continuous() +
    facet_wrap(~ uniqueness) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    labs(color = "Size class",
         fill = "Herbarium size",
         x = "Species status",
         y = "Probability predicted by model")
  
  # Save graph to PNG file
  ggsave(paste("output/Marsico-et-al-2020_predictedFigureProp_", 
               recordedVariables, "_",
               Sys.Date(), 
               ".png", 
               sep = ""),
         width = 8,
         height = 4)
}
# Run function for default analysis variables
graph_analysisResults(analysisSummary_100000_all_FALSE,
                      county_mGLOBAL_100000_all_FALSE,
                      locality_mGLOBAL_100000_all_FALSE,
                      time_mGLOBAL_100000_all_FALSE)

First, here is our observed count of specimen records:

We can represent the same data as a proportion of how likely a given specimen is to fall into each of our uniqueness scales:

And we can also graph the proportions that our models predict for how likely a given specimen is to fall into each of our uniqueness scales:

Re-run with a different size cutoff for small vs. large herbaria

Although we used a cutoff of 100,000 specimens to differentiate between small and large herbaria, emerging research suggests that a more appropriate cutoff might be <175,000 specimens based on the Jenks natural breaks classification method (Cahill, Thiers, and Monfils 2019). We can easily re-run our analyses to see if adjusting the cutoff affects our results…

# Run function `prepare_analysisSpms` with adjusted size cutoff
prepare_analysisSpms(input_specimens, input_herbaria, input_species, 
                     sizeCutoff = 175000)

# Run function `model_analysisSpms` with adjusted size cutoff
model_analysisSpms(analysisSpms_175000_all_FALSE)

# Run function `graph_analysisResults` with adjusted size cutoff
graph_analysisResults(analysisSummary_175000_all_FALSE,
                      county_mGLOBAL_175000_all_FALSE,
                      locality_mGLOBAL_175000_all_FALSE,
                      time_mGLOBAL_175000_all_FALSE)

totalSpmsSummary_175000_all_FALSE is a summary of how many total specimens are held by the herbaria participating in this project:

sizeClass totalSpecimens percentSpecimens
large 11962200 79
small 3204533 21

duplicateSpmsSummary_175000_all_FALSE is a summary of how many specimens in the uniqueSpms dataset represent duplicates in the original data (i.e. the input_specimens):

duplicateType largeCount smallCount largePercent smallPercent
duplicated by large and small herbaria 283 153 2.87 2.36
duplicated only by large herbaria 1496 NA 15.15 NA
unduplicated 8094 5777 81.98 89.22
duplicated only by small herbaria NA 545 NA 8.42

analysisSummary_175000_all_FALSE is a summarized version of the analysisSpms data to use for analysis and graphing:

uniqueness state status sizeClass count frequency
unique_county AR introduced small 87 1.00
unique_county AR native small 155 1.00
unique_county AR S1 small 6 1.00
unique_county AR S2 small 23 1.00
unique_county CA introduced large 24 0.67
unique_county CA introduced small 12 0.33
unique_county CA native large 10 0.48
unique_county CA native small 11 0.52
unique_county CA S1 large 7 0.64
unique_county CA S1 small 4 0.36
unique_county CA S2 large 7 0.41
unique_county CA S2 small 10 0.59
unique_county CO introduced large 25 0.50
unique_county CO introduced small 25 0.50
unique_county CO native large 32 0.52
unique_county CO native small 30 0.48
unique_county CO S1 large 1 0.50
unique_county CO S1 small 1 0.50
unique_county CO S2 large 2 0.20
unique_county CO S2 small 8 0.80
unique_county FL introduced large 51 0.84
unique_county FL introduced small 10 0.16
unique_county FL native large 43 0.86
unique_county FL native small 7 0.14
unique_county FL S1 large 8 0.67
unique_county FL S1 small 4 0.33
unique_county FL S2 large 12 0.52
unique_county FL S2 small 11 0.48
unique_county GA introduced large 145 0.58
unique_county GA introduced small 105 0.42
unique_county GA native large 110 0.69
unique_county GA native small 50 0.31
unique_county GA S1 large 6 0.67
unique_county GA S1 small 3 0.33
unique_county GA S2 large 19 0.56
unique_county GA S2 small 15 0.44
unique_county MI introduced large 87 0.72
unique_county MI introduced small 33 0.28
unique_county MI native large 107 0.76
unique_county MI native small 33 0.24
unique_county MI S1 large 11 0.92
unique_county MI S1 small 1 0.08
unique_county MI S2 large 33 0.77
unique_county MI S2 small 10 0.23
unique_county TN introduced large 130 0.73
unique_county TN introduced small 47 0.27
unique_county TN native large 160 0.83
unique_county TN native small 33 0.17
unique_county TN S1 large 8 1.00
unique_county TN S2 large 18 0.78
unique_county TN S2 small 5 0.22
unique_county WV introduced large 176 0.99
unique_county WV introduced small 2 0.01
unique_county WV native large 67 0.97
unique_county WV native small 2 0.03
unique_county WV S1 large 18 0.95
unique_county WV S1 small 1 0.05
unique_county WV S2 large 38 0.88
unique_county WV S2 small 5 0.12
unique_locality AR introduced small 31 1.00
unique_locality AR native small 242 1.00
unique_locality AR S1 small 1 1.00
unique_locality AR S2 small 16 1.00
unique_locality CA introduced large 294 0.64
unique_locality CA introduced small 168 0.36
unique_locality CA native large 442 0.77
unique_locality CA native small 135 0.23
unique_locality CA S1 large 103 0.90
unique_locality CA S1 small 12 0.10
unique_locality CA S2 large 35 0.81
unique_locality CA S2 small 8 0.19
unique_locality CO introduced large 48 0.37
unique_locality CO introduced small 82 0.63
unique_locality CO native large 148 0.44
unique_locality CO native small 191 0.56
unique_locality CO S1 large 14 0.40
unique_locality CO S1 small 21 0.60
unique_locality CO S2 large 35 0.48
unique_locality CO S2 small 38 0.52
unique_locality FL introduced large 148 0.82
unique_locality FL introduced small 32 0.18
unique_locality FL native large 202 0.84
unique_locality FL native small 39 0.16
unique_locality FL S1 large 26 0.72
unique_locality FL S1 small 10 0.28
unique_locality FL S2 large 21 0.54
unique_locality FL S2 small 18 0.46
unique_locality GA introduced large 47 0.35
unique_locality GA introduced small 88 0.65
unique_locality GA native large 114 0.80
unique_locality GA native small 28 0.20
unique_locality GA S1 large 2 0.67
unique_locality GA S1 small 1 0.33
unique_locality GA S2 large 7 0.47
unique_locality GA S2 small 8 0.53
unique_locality MI introduced large 119 0.60
unique_locality MI introduced small 78 0.40
unique_locality MI native large 219 0.66
unique_locality MI native small 114 0.34
unique_locality MI S1 large 2 0.67
unique_locality MI S1 small 1 0.33
unique_locality MI S2 large 20 0.74
unique_locality MI S2 small 7 0.26
unique_locality TN introduced large 86 0.58
unique_locality TN introduced small 63 0.42
unique_locality TN native large 191 0.72
unique_locality TN native small 73 0.28
unique_locality TN S1 large 5 0.71
unique_locality TN S1 small 2 0.29
unique_locality TN S2 large 20 0.83
unique_locality TN S2 small 4 0.17
unique_locality WV introduced large 97 0.98
unique_locality WV introduced small 2 0.02
unique_locality WV native large 20 0.77
unique_locality WV native small 6 0.23
unique_locality WV S1 large 7 1.00
unique_locality WV S2 large 36 0.82
unique_locality WV S2 small 8 0.18
unique_time AR introduced small 59 1.00
unique_time AR native small 266 1.00
unique_time AR S1 small 4 1.00
unique_time AR S2 small 40 1.00
unique_time CA introduced large 425 0.64
unique_time CA introduced small 236 0.36
unique_time CA native large 609 0.79
unique_time CA native small 158 0.21
unique_time CA S1 large 168 0.79
unique_time CA S1 small 44 0.21
unique_time CA S2 large 133 0.76
unique_time CA S2 small 41 0.24
unique_time CO introduced large 82 0.30
unique_time CO introduced small 192 0.70
unique_time CO native large 192 0.32
unique_time CO native small 400 0.68
unique_time CO S1 large 27 0.36
unique_time CO S1 small 48 0.64
unique_time CO S2 large 43 0.35
unique_time CO S2 small 79 0.65
unique_time FL introduced large 345 0.67
unique_time FL introduced small 168 0.33
unique_time FL native large 344 0.74
unique_time FL native small 122 0.26
unique_time FL S1 large 105 0.60
unique_time FL S1 small 71 0.40
unique_time FL S2 large 174 0.74
unique_time FL S2 small 60 0.26
unique_time GA introduced large 124 0.34
unique_time GA introduced small 239 0.66
unique_time GA native large 151 0.65
unique_time GA native small 80 0.35
unique_time GA S1 large 9 0.64
unique_time GA S1 small 5 0.36
unique_time GA S2 large 18 0.82
unique_time GA S2 small 4 0.18
unique_time MI introduced large 408 0.61
unique_time MI introduced small 262 0.39
unique_time MI native large 872 0.68
unique_time MI native small 418 0.32
unique_time MI S1 large 41 0.71
unique_time MI S1 small 17 0.29
unique_time MI S2 large 99 0.80
unique_time MI S2 small 24 0.20
unique_time TN introduced large 254 0.42
unique_time TN introduced small 347 0.58
unique_time TN native large 505 0.59
unique_time TN native small 344 0.41
unique_time TN S1 large 42 0.40
unique_time TN S1 small 64 0.60
unique_time TN S2 large 65 0.52
unique_time TN S2 small 60 0.48
unique_time WV introduced large 214 0.74
unique_time WV introduced small 74 0.26
unique_time WV native large 71 0.65
unique_time WV native small 39 0.35
unique_time WV S1 large 35 0.92
unique_time WV S1 small 3 0.08
unique_time WV S2 large 80 0.71
unique_time WV S2 small 32 0.29

We can summarize the relevant results from each model so that they are easier to compare:

responseVariable modelName modelFormula AIC dAIC weight
unique_county m1 unique_county ~ sizeClass + (1 | state) 11022 67.9 0
unique_county m2 unique_county ~ status + (1 | state) 11088 133.9 0
unique_county mGLOBAL unique_county ~ sizeClass + status + (1 | state) 10955 0.0 1
unique_county mNULL unique_county ~ 1 + (1 | state) 11138 182.9 0
unique_locality m1 unique_locality ~ sizeClass + (1 | state) 17413 88.7 0
unique_locality m2 unique_locality ~ status + (1 | state) 17343 18.6 0
unique_locality mGLOBAL unique_locality ~ sizeClass + status + (1 | state) 17324 0.0 1
unique_locality mNULL unique_locality ~ 1 + (1 | state) 17435 110.5 0
unique_time m1 unique_time ~ sizeClass + (1 | state) 20453 51.9 0
unique_time m2 unique_time ~ status + (1 | state) 20534 132.6 0
unique_time mGLOBAL unique_time ~ sizeClass + status + (1 | state) 20402 0.0 1
unique_time mNULL unique_time ~ 1 + (1 | state) 20582 180.7 0

Now that we’ve built our models we need to verify that they are appropriate for us to use. First, we should check the global models to make sure that none of our predictor variables are correlated. We can evaluate this by looking at the variance inflation factor for each uniqueness scale.
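As before, the chunk that computed these values is not shown; presumably it applied car::vif() to the 175,000-cutoff global models, e.g.:

# Check (generalized) variance inflation factors for the 175,000-cutoff models
vif(county_mGLOBAL_175000_all_FALSE)
vif(locality_mGLOBAL_175000_all_FALSE)
vif(time_mGLOBAL_175000_all_FALSE)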

The GVIF values produced by this check are all low (<2.5), confirming that our predictor variables are not strongly correlated in any of the models.

We will also look at Cramér’s V coefficients between the predictor variables to further check that there are no substantive associations among them.
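And again, a sketch of the Cramér’s V check on the corresponding model data (same assumed predictor pairs as above):

# Pairwise Cramer's V between predictors in the 175,000-cutoff model data
cramersV(modelData_175000_all_FALSE$sizeClass, modelData_175000_all_FALSE$status)
cramersV(modelData_175000_all_FALSE$sizeClass, modelData_175000_all_FALSE$state)
cramersV(modelData_175000_all_FALSE$status, modelData_175000_all_FALSE$state)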

The resulting values are all below 0.3, indicating at most a weak association between variables, and values below 0.1 indicate little to no association. These values meet our threshold.

And finally, in the figures below, we are comparing our observed (from the data) and predicted (from the model) probabilities that a specimen represents unique information at a given biogeographic scale. First, here is our observed count of specimen records:

We can represent the same data as a proportion of how likely a given specimen is to fall into each of our uniqueness scales:

And we can also graph the proportions that our models predict for how likely a given specimen is to fall into each of our uniqueness scales: