textOut.txt

D:/cwrNA/src/dataPrep/addNorthAmericanCounts.r 

###
# add the number of points that occur in modeling area to counts CSV and rewrite
# dan.carver@carverd.com
# 20200414
###

# Add the number of occurrences inside the modeling area to the species'
# counts CSV and rewrite that file in place.
#
# Reads two globals: `spPoint` (its @data slot is expected to carry a `type`
# column holding "G" / "H" codes) and `sp_dir` (folder that holds counts.csv).
# Three columns are added or overwritten in counts.csv:
#   NA_occurrences - nrow(spPoint)
#   NA_GUseful     - number of records with type "G"
#   NA_HUseful     - number of records with type "H"
#
# species: never read inside the body; kept so existing callers still work.
# Returns the (invisible NULL) result of write.csv().
addNorthAmericanCounts <- function(species){
  # Count each type directly. The previous group-and-branch logic assumed the
  # summary had exactly one or two rows: it failed with zero points, and gave
  # NA counts when a third type value (or NA) was present.
  types <- as.character(spPoint@data$type)
  gs <- sum(types == "G", na.rm = TRUE)
  hs <- sum(types == "H", na.rm = TRUE)

  # read in existing counts data
  countsFile <- paste0(sp_dir, "/counts.csv")
  df <- read.csv(countsFile)

  # assign values
  df$NA_occurrences <- nrow(spPoint)
  df$NA_GUseful <- gs
  df$NA_HUseful <- hs

  # write out content.
  write.csv(x = df, file = countsFile, row.names = FALSE)
}
D:/cwrNA/src/dataPrep/countryCheck.R 

###
# subset all point locations that do not fall within a country of interest
# and generate the first version of clean data which is used in the modeling
# process
# 20190827
# carver.dan1@gmail.com
###

# Drop occurrence points that fall outside any country polygon and, where
# state-level records exist for the taxon, outside the listed states; publish
# the result as the global `cleanPoints`.
#
# Globals read: spPoint, countrySHP, statesData, statesSpObject.
# Globals written: xyData (bare coordinates, assigned before the CRS is set on
# the local copy) and cleanPoints.
#
# species: value matched against statesData$name.
# Returns (invisibly) the object assigned to cleanPoints.
countryCheck <- function(species){
  # Bare coordinate object; the global copy is published first and the CRS is
  # then set on the local copy only.
  pts <- sp::SpatialPoints(spPoint@coords)
  xyData <<- pts
  crs(pts) <- crs(countrySHP)

  # Country polygon under every point (NA where no polygon is hit).
  # https://www.rdocumentation.org/packages/sp/versions/1.3-1/topics/over-methods
  isoHit <- data.frame(over(x = pts, y = countrySHP)) %>%
    dplyr::select(ISO_A3)

  # Keep only the points that received an ISO3 code.
  spPoint@data$iso3_check <- isoHit$ISO_A3
  spPoint <- spPoint[complete.cases(spPoint@data$iso3_check), ]

  # States recorded for this taxon, blanks removed.
  knownStates <- statesData %>%
    dplyr::filter(name == species) %>%
    dplyr::distinct(State) %>%
    dplyr::filter(State != "")

  # No state-specific data for this taxon: every on-land point is kept.
  if (nrow(knownStates) == 0) {
    cleanPoints <<- spPoint
    return(invisible(spPoint))
  }

  # States are only tested for Mexico, USA and Canada, so split the points.
  spPoint$StateTest <- spPoint@data$iso3_check %in% c("MEX", "USA", "CAN")
  inMuc <- spPoint$StateTest == TRUE
  mucPoints <- spPoint[inMuc, ]
  nonMucPoints <- spPoint[!inMuc, ]

  # Nothing inside MEX/USA/CAN: keep the remaining points as they are.
  if (nrow(mucPoints) == 0) {
    cleanPoints <<- nonMucPoints
    return(invisible(nonMucPoints))
  }

  # Restrict the state polygons to the known states and keep the MEX/USA/CAN
  # points that land inside one of them.
  sSp <- statesSpObject[statesSpObject$NAME_1 %in% knownStates$State, ]
  stateHit <- as.data.frame(over(x = mucPoints, y = sSp)) %>%
    dplyr::select(NAME_1)
  mucPoints <- mucPoints[!is.na(stateHit$NAME_1), ]

  # No point fell inside a known state: fall back to all on-land points.
  if (nrow(mucPoints) == 0) {
    cleanPoints <<- spPoint
    return(invisible(spPoint))
  }

  # Recombine, drop duplicated attribute rows and rebuild the spatial object
  # that is used as the base data for modeling.
  spPoint <- rbind(mucPoints, nonMucPoints)
  uniqueP <- distinct(spPoint@data)
  coords <- cbind(uniqueP$longitude, uniqueP$latitude)
  cleanPoints <<- sp::SpatialPointsDataFrame(coords = coords, data = uniqueP, proj4string = crs(spPoint))
}
D:/cwrNA/src/dataPrep/create_sp_dirs.R 

###
# Test for existing file stucture and generates sturcture if needed.
# dan.carver@carverd.com
# 20200414
###

# Make sure the per-species output folder tree exists, creating whatever is
# missing.
#
# Globals read: gap_dir, genus, run_version.
# Global written: sp_dir (<gap_dir>/<genus>/<species>/<run_version>).
#
# species: folder name of the species inside the genus folder.
# Returns `species` unchanged.
create_sp_dirs <- function(species) {
  # species dir, published globally for the later steps
  speciesDir <- paste0(gap_dir, "/", genus, "/", species, "/", run_version)
  sp_dir <<- speciesDir

  # sub-directories expected below the species dir
  subDirs <- c("gap_analysis/combined",
               "gap_analysis/exsitu",
               "gap_analysis/insitu",
               "gap_analysis/redList",
               "modeling/alternatives",
               "modeling/maxent",
               "modeling/nativeArea",
               "modeling/replicates",
               "occurrences")

  for (target in c(speciesDir, paste0(speciesDir, "/", subDirs))) {
    if (!file.exists(target)) {
      # spelled out as TRUE: `T` is an ordinary variable that can be reassigned
      dir.create(target, recursive = TRUE)
    }
  }

  species
}
D:/cwrNA/src/dataPrep/dataBaseTransform/botanicalGardenTransform.R 

###
# reworking of botanical garden data
# 20190815
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)
library(naniar)
library(reshape2)

# Folder holding the botanical garden export.
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/botanicalGarden"

# Read the raw export.
csvPath <- paste0(base_dir,"/P1ABC GRIN PlantSearch match ALL.csv")
data <- data.table::fread(csvPath, header = TRUE)

# Keep only the columns needed downstream; nr is the number of records.
dataThin <- dplyr::select(data,
                          "Institution",  "CollectionType",
                          "ParentFULLTaxon_GRIN Global_2019 final", "ParentGenus",
                          "ParentSpecies","ParentInfraRank", "ParentInfraEpi",
                          "ParentInfraRank2", "ParentInfraEpi2")
nr <- nrow(dataThin)

# Blank species / infraspecific cells become NA so the species string can be
# built from whichever of the five pieces are present.
dataThin <- naniar::replace_with_na(
  dataThin,
  replace = list(ParentSpecies="",ParentInfraRank ="",
                 ParentInfraEpi="", ParentInfraRank2="",
                 ParentInfraEpi2="" ))


# Fold the infraspecific rank/epithet columns of ONE row into its species
# column (column 5).
#
# x: a single-row table whose columns 5 to 9 are, by position: species,
#    first infraspecific rank, first epithet, second rank, second epithet.
# Returns x with column 5 replaced by the space-separated name:
#   - column 9 present           -> all five pieces
#   - else column 7 present      -> species + first rank + first epithet
#   - otherwise                  -> x is returned untouched
# NA pieces are left out of the pasted string.
#
# Previously the three-piece string was written into column 5 and then reused
# as the "species" of the five-piece string, so the first rank and epithet
# appeared twice; the literal " " arguments to paste() also produced triple
# spaces between the pieces.
f <- function(x){
  pieces <- vapply(5:9, function(j) as.character(x[[j]]), character(1))

  if (!is.na(pieces[5])) {
    used <- pieces
  } else if (!is.na(pieces[3])) {
    used <- pieces[1:3]
  } else {
    return(x)
  }

  x[, 5] <- paste(used[!is.na(used)], collapse = " ")
  return(x)
}


# Run f() on every record and stack the results once.
# seq_len() yields no iterations when nr is 0 (1:nr would visit rows 1 and 0),
# and a single rbindlist() call avoids re-copying the growing table on every
# iteration. dataThin[i] is data.table row selection, so each piece handed to
# rbindlist() is a one-row data.table.
t2 <- data.table::rbindlist(
  lapply(seq_len(nr), function(i) f(dataThin[i]))
)


# Build the standardised 16-column occurrence table straight from t2.
# Columns this source does not provide are filled with NA; every record is
# tagged as type "G" with databaseSource "BotanicialGarden".
df <- data.frame(
  taxon = t2$`ParentFULLTaxon_GRIN Global_2019 final`,
  genus = t2$ParentGenus,
  species = t2$ParentSpecies,
  latitude = NA,
  longitude = NA,
  databaseSource = "BotanicialGarden",
  institutionCode = t2$Institution,
  type = "G",
  uniqueID = NA,
  sampleCategory = t2$CollectionType,
  country = NA,
  iso3 = NA,
  localityInformation = NA,
  biologicalStatus = NA,
  collectionSource = NA,
  finalOriginStat = NA,
  stringsAsFactors = FALSE
)

# Load checkSynonym() and standardise the taxon names.
source(file="D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonymsFunction.R")
df2 <- checkSynonym(df)

# Flag, per record, whether a usable latitude and longitude are present
# ("\\N" and "" count as missing).
testLatLong <<- dplyr::mutate(
  dplyr::select(df2, c("uniqueID","latitude", "longitude")),
  hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "",
  hasLong = !is.na(longitude) & longitude != "\\N" & longitude != "",
  hasLatLong = hasLat & hasLong
)

# Records where exactly one of the two coordinates is present.
summariseErrors <- dplyr::filter(testLatLong, xor(hasLat, hasLong))

# Report and save the mismatches.
print(paste0("there are ", nrow(summariseErrors)," miss matach lat long pairs."))
write.csv(x = summariseErrors, file = paste0(base_dir,"/mismatchLatLong.csv"))

# Save the refined botanical garden table.
write.csv(x = df2, file = paste0(base_dir,"/refinedBotanicalGarden.csv"))

D:/cwrNA/src/dataPrep/dataBaseTransform/capsicumTransform.R 

###
# reworking of capsicum data
# 20191120
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)
library(MazamaSpatialUtils)

#set base dir
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/capsicum"

# Load in data
csvPath <- paste0(base_dir,"/Khoury_Capsicum_paperdata_USAspecies20191206.csv")
data <- data.table::fread(input = csvPath,header = TRUE)

# Select necessary columns from dataset
dataThin <- data %>%
  dplyr::select("taxon","latitude","longitude", "db","institutioncode",
         "type", "record_identifier","sampstat","collsrc", "country",
         "iso2", "adm1", "adm2", "adm3", "adm4", "locality")
nr <- nrow(dataThin)

# Replace iso 2 with iso 3.
# Only blank codes are set to NA. The previous gsub(pattern = "",
# replacement = NA, ...) matched every string (an empty pattern always
# matches, and an NA replacement turns each match into NA), so every iso2 -
# and therefore every iso3 - ended up NA.
dataThin$iso2[which(dataThin$iso2 == "")] <- NA
# NOTE(review): assumes iso2ToIso3() passes NA through as NA - confirm.
dataThin$iso3 <- MazamaSpatialUtils::iso2ToIso3(dataThin$iso2)

# define locality information: admin levels 1-4 and locality in one string
dataThin$local2 <- paste(dataThin$adm1,dataThin$adm2,dataThin$adm3,
                         dataThin$adm4, dataThin$locality,
                         sep=" -- ")

# Build the standardised 16-column occurrence table straight from dataThin.
# genus and species are fixed strings for this source; country and the last
# two columns are not provided and stay NA.
# NOTE(review): `collsrc` feeds biologicalStatus while collectionSource is NA -
# confirm this mapping is intended.
df <- data.frame(
  taxon = dataThin$taxon,
  genus = "Capsicum",
  species = "annuum var. glabriusculum",
  latitude = dataThin$latitude,
  longitude = dataThin$longitude,
  databaseSource = "Capsicum",
  institutionCode = dataThin$institutioncode,
  type = dataThin$type,
  uniqueID = dataThin$record_identifier,
  sampleCategory = dataThin$sampstat,
  country = NA,
  iso3 = dataThin$iso3,
  localityInformation = dataThin$local2,
  biologicalStatus = dataThin$collsrc,
  collectionSource = NA,
  finalOriginStat = NA,
  stringsAsFactors = FALSE
)

# Taxon names arrive with "_" between words; switch to spaces.
df$taxon <- gsub(pattern = "_",replacement = " ",x = df$taxon)

# Save the refined capsicum table.
write.csv(x = df, file = paste0(base_dir, "/refinedCapsicum.csv"))

D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonyms.R 

####
# thin the combined data so it contains only known species of interest. 
# once select, I want to rename all the synonyms to match a single taxon 
# The output dataset will be something that can be used for modeling 
# carver.dan1@gmail.com 
# 20190822
###


library(tidyverse)
library(data.table)
# Combined occurrence table produced by the combine step (read from the
# cwrOfNA tree, unlike most other scripts which use D:/cwrNA).
data <- data.table::fread("D:/cwrOfNA/occurrence_data2019_05_29/combinedOccurance2019-08-29.csv", header = TRUE)

base_dir <- "D:/cwrOfNA"

# Earlier form of the read above, built from base_dir; kept for reference.
#data <- data.table::fread(file = paste0(base_dir, "/occurrence_data2019_05_29/combinedOccurance2019-08-29.csv"), header = TRUE)

# Synonym sheet: accepted taxon in `Taxon_GRIN Global_2019 final`, synonym in
# `synonym`.
syn <- data.table::fread(file = paste0(base_dir, "/speciesList/CWRoftheUSA_synonyms.csv") ,header = TRUE)

# The taxon/synonym sheet uses spaces between words while the occurrence data
# uses "_", so replace "_" with a space before matching.
data$taxon <-gsub("_", " ", data$taxon, fixed=TRUE)

# Records whose taxon is already an accepted name.
primary <- data[data$taxon %in% unique(syn$`Taxon_GRIN Global_2019 final`),]

# Unique, non-blank synonyms.
synList <- syn[which(syn$synonym != ""),]
synList <- unique(synList$synonym)

# Records whose taxon is a synonym and therefore needs renaming.
secondary <- data[data$taxon %in% synList,] 


# Synonym rows only (blank synonyms dropped); used as the join table.
syn2 <- syn[which(syn$synonym != ""),]

# Issue with the join: some synonyms belong to more than one accepted taxon,
# and there is no way of knowing which one a record refers to. The options are
# dropping those records or duplicating them. Affected records can be spotted
# because their V1 value appears more than once after the join.
### example of the join issue (rows 1800-1815 of `secondary` only; join1 is
### not used again below)
syn2$syn1 <- syn2$synonym
join1 <- dplyr::left_join(x = secondary[1800:1815,], y = syn2 ,by = c("taxon" = "synonym"), keep=TRUE) 
### 


# Full join: each synonym record picks up its accepted taxon (and is
# duplicated when the synonym maps to several taxa).
syn2$syn1 <- syn2$synonym
join2 <- dplyr::left_join(x = secondary, y = syn2 ,by = c("taxon" = "synonym"), keep=TRUE) 

# Overwrite the synonym with the accepted GRIN Global taxon.
join2$taxon <- join2$`Taxon_GRIN Global_2019 final`
# TODO: genus is not updated here - still to be worked out.

### duplicate issues: V1 values that occur more than once after the join
issues <- join2 %>% 
  group_by(V1)%>%
  count()%>%
  filter(n != 1)

# Original records behind those duplicates, and the taxa they involve.
iss2 <- secondary[secondary$V1 %in% issues$V1, ] 
speciecsOfIssue <- unique(iss2$taxon)
### 


# Drop the columns added by the join and stack the renamed records under the
# records that already carried an accepted name.
df1 <- dplyr::select(.data = join2,-c("Taxon_GRIN Global_2019 final","taxonomy_species_id","name","note","syn1"))
df2 <- rbind(primary, df1)

# Write out the modeling data; the file name carries today's date.
write.csv(x = df2, file = paste0(base_dir,"/modelingData", Sys.Date(), ".csv"))
D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonymsFunction.R 

####
# thin the combined data so it contains only known species of interest. 
# once select, I want to rename all the synonyms to match a single taxon 
# The output dataset will be something that can be used for modeling 
# carver.dan1@gmail.com 
# 20190822
###


# Standardise the `taxon` column of an occurrence table against the synonym
# sheet and keep only records whose final taxon is an accepted name.
#
# Steps:
#   1. alternate names (note == "alt name") are replaced by their synonym
#   2. synonyms (note == "synonym") are replaced by the accepted taxon
#   3. rows whose taxon is not in the accepted list are dropped
#
# data:        table with a `taxon` column.
# synonymFile: path of the synonym sheet; it is read with read.csv() and must
#              provide the columns note, synonym, alt.syn, name and
#              Taxon_GRIN.Global_2019.final. Defaults to the project sheet.
# Returns the filtered table with the same columns as `data`.
#
# NOTE(review): left_join() duplicates a record when its name matches more
# than one row of the sheet; that behaviour is unchanged here.
checkSynonym <- function(data, synonymFile = "D:/cwrNA/speciesList/CWRoftheUSA_synonyms20191114.csv"){
  l1 <- ncol(data)
  d1 <- read.csv(synonymFile, header = TRUE)
  syn <- dplyr::filter(d1, note == "synonym")
  alt <- dplyr::filter(d1, note == "alt name")
  acp <- dplyr::filter(d1, note == "accepted")

  # 1. alt names -> synonyms
  alt$alt.syn <- as.character(alt$alt.syn)
  alt$synonym <- as.character(alt$synonym)

  dataJoin <- dplyr::left_join(data, alt, by = c("taxon" = "alt.syn" ))
  hasAlt <- !is.na(dataJoin$synonym)
  dataJoin$taxon[hasAlt] <- dataJoin$synonym[hasAlt]
  print(paste0(sum(hasAlt), " number of occurences had an alt name that were replaced with a synonym"))

  # drop the columns added by the join
  dataJoin <- dataJoin[, seq_len(l1), drop = FALSE]

  # 2. synonyms -> accepted names
  syn$synonym <- as.character(syn$synonym)
  syn$Taxon_GRIN.Global_2019.final <- as.character(syn$Taxon_GRIN.Global_2019.final)

  # Join the output of step 1. Joining the untouched `data` here (as before)
  # threw the alt-name replacements away.
  dataJoin2 <- dplyr::left_join(dataJoin, syn, by = c("taxon" = "synonym" ))
  hasSyn <- !is.na(dataJoin2$name)
  dataJoin2$taxon[hasSyn] <- dataJoin2$Taxon_GRIN.Global_2019.final[hasSyn]
  print(paste0(sum(hasSyn), " number of occurences had a synonym that were replaced with an accepted name"))

  # drop the columns added by the join
  dataJoin2 <- dataJoin2[, seq_len(l1), drop = FALSE]

  # 3. keep rows whose taxon is an accepted name
  primary <- dataJoin2[dataJoin2$taxon %in% unique(acp$Taxon_GRIN.Global_2019.final),]
  print(paste0("there were ", nrow(data)-nrow(primary) ," removed for not being accecpt crop wild relative species"))

  return(primary)
}
D:/cwrNA/src/dataPrep/dataBaseTransform/combineAllDatasets.R 

###
# The goal of this script is to compile all the csv from individual sources into
# a single element. 
# Not sure if this is really the beset structure to work with but I'm going to try
# it and see what happens 
### 

library(tidyverse)
library(data.table)


base_dir <- "D:/cwrNA/occurrence_data2019_05_29"

# List every CSV below base_dir. The dot is escaped and the pattern anchored
# so only names ending in ".csv" match (".csv" as a regex matched any
# character followed by "csv", anywhere in the name).
files <- list.files(path = base_dir, pattern = "\\.csv$",full.names = TRUE,recursive = TRUE)
# select all those that have 'refined' in the name
refined <- files[grepl(pattern = 'refined', x = files)]
# Drop idigbio for now. A logical mask is used because refined[-grep(...)]
# returns an EMPTY vector when nothing matches (negative indexing with
# integer(0) selects nothing).
refined <- refined[!grepl(pattern = "refinedIdigBio", x = refined)]

# Empty table with the standard 16-column structure; the refined sources are
# appended to it.
df1 <- data.frame(taxon=character(),
                 genus=character(),
                 species=character(),
                 latitude=double(),
                 longitude=double(),
                 databaseSource=character(),
                 institutionCode=character(),
                 type=factor(),
                 uniqueID=factor(),
                 sampleCategory=character(),
                 country=character(),
                 iso3=character(),
                 localityInformation=character(),
                 biologicalStatus = character(),
                 collectionSource = character(),
                 finalOriginStat = character(),
                 stringsAsFactors=FALSE)

# TODO (not implemented): test for duplicates with GBIF data
# TODO (not implemented): test for invalid lat long

# Read the refined CSV whose path contains `tag` and keep the 16 standard
# columns. Looks the path up in the global `refined` vector.
readRefinedSource <- function(tag) {
  fread(refined[grep(pattern = tag, x = refined)], header = TRUE) %>%
    dplyr::select("taxon","genus","species","latitude","longitude",
                  "databaseSource","institutionCode","type","uniqueID",
                  "sampleCategory","country","iso3","localityInformation",
                  "biologicalStatus", "collectionSource","finalOriginStat")
}

# GBIF is read separately (troubleshooting) and added after the other sources.
gbif <- readRefinedSource("refinedGBIF")
gbif$uniqueID <- as.factor(gbif$uniqueID)

# Capsicum and Cucurbita come from curated files that later replace any rows
# of those genera found in the other sources.
capsicum <- readRefinedSource("refinedCapsicum")
cucurbita <- readRefinedSource("refinedCucurbita")

# Take the three files out of the generic list. A logical mask is used because
# refined[-grep(...)] returns an EMPTY vector when the pattern matches nothing.
refined <- refined[!grepl(pattern = "refinedGBIF|refinedCapsicum|refinedCucurbita",
                          x = refined)]


# Read one refined CSV, keep the 16 standard columns and append its rows to
# `dataframe`.
#
# dataframe:  table accumulated so far.
# pathToData: path of the CSV to read.
# Returns `dataframe` with the new rows bound underneath.
appendTable <- function(dataframe, pathToData){
  # The stray trailing comma after `header = TRUE` passed an empty extra
  # argument to fread(); it is removed. The explicit rm() is gone too - the
  # local table is released when the function returns.
  incoming <- data.table::fread(pathToData, header = TRUE) %>%
    dplyr::select("taxon","genus","species","latitude","longitude",
                 "databaseSource","institutionCode","type","uniqueID",
                 "sampleCategory","country","iso3","localityInformation",
                 "biologicalStatus", "collectionSource","finalOriginStat")
  rbind(dataframe, incoming)
}

# Append every remaining refined CSV. Iterating over the paths themselves
# means an empty list is skipped (1:length(refined) would visit indices 1 and
# 0 when the list is empty).
for(path in refined){
  df1 <- appendTable(dataframe = df1, pathToData = path)
  print(paste0(path, " has been added"))
  print(dim(df1))
}
# add gbif
df1 <- rbind(df1, gbif)

# Remove all cucurbita and capsicum rows. %in% never returns NA, so records
# with a missing genus are kept as they are; with `!=` a missing genus gave an
# NA index (an all-NA row in a data.frame, a dropped row in a data.table).
df1 <- df1[!(df1$genus %in% c("Capsicum", "Cucurbita")),]


# add clean capsicum and cucurbita
df1 <- rbind(df1, cucurbita,capsicum)


# Zizania texana records come from their own file.
zT <- read.csv("D:/cwrNA/occurrence_data2019_05_29/zizania/Zizania_texana.csv")
nr <- nrow(zT)

# Shape them like the standard 16-column table. Columns this file does not
# provide are left as empty strings.
df2 <- data.frame(
  taxon = "Zizania texana",
  genus = "Zizania",
  species = "texana",
  latitude = zT$lat,
  longitude = zT$lon,
  databaseSource = zT$Source,
  institutionCode = "",
  type = zT$Type,
  uniqueID = zT$id,
  sampleCategory = "",
  country = "",
  iso3 = "",
  localityInformation = "",
  biologicalStatus = "",
  collectionSource = "",
  finalOriginStat = "",
  stringsAsFactors = FALSE
)

# Add the additional rice data to the whole set.
df1 <- rbind(df1, df2)


# Implement the taxonomic changes from Colin's removal of species (20200226).

# Taxa that are not part of the CWR species list for this study. None of them
# has records, so dropping them changes little, but it keeps the record
# straight; they were removed from the cwr_NA list csv as well.
sRemove <- c("Capsicum annuum", "Cucurbita okeechobeensis","Cucurbita pepo","Cucurbita pepo var. ozarkana",
"Cucurbita pepo var. texana","Persea borbonia var. borbonia","Ribes cereum var. inebrians")
df1 <- df1[!df1$taxon %in% sRemove,]

# Renames, one entry per edit: current taxon, new taxon, new species string.
# The first converts a variety to a species; the rest convert a species to an
# intraspecific taxon.
taxonEdits <- list(
  c("Persea borbonia var. pubescens",
    "Persea palustris",
    "palustris"),
  c("Vaccinium ovalifolium",
    "Vaccinium ovalifolium var. ovalifolium",
    "ovalifolium var. ovalifolium"),
  c("Vaccinium erythrocarpum",
    "Vaccinium erythrocarpum subsp. erythrocarpum",
    "erythrocarpum subsp. erythrocarpum"),
  c("Allium schoenoprasum",
    "Allium schoenoprasum subsp. schoenoprasum",
    "schoenoprasum subsp. schoenoprasum")
)

for(taxonEdit in taxonEdits){
  ## pull all records of the current taxon and relabel them
  t1 <- df1 %>% filter(taxon == taxonEdit[1])
  t1$taxon <- taxonEdit[2]
  t1$species <- taxonEdit[3]
  ## everything else, with the relabelled records appended at the end
  df2 <- df1 %>% filter(taxon != taxonEdit[1])
  df1 <- rbind(df2, t1)
}


## Taxa whose records may have been re-assigned by hand. Their rows are removed
## from df1 and replaced with the edited rows read from the two spreadsheets
## below.
spList <- c("Leymus salina","Leymus salina subsp. mojavensis","Leymus salina subsp. salina",
            "Leymus salina subsp. salmonis","Persea borbonia","Persea borbonia var. borbonia",
            "Persea borbonia var. pubescens","Saccharum brevibarbe","Saccharum brevibarbe var. brevibarbe",
            "Saccharum brevibarbe var. contortum","Vaccinium crassifolium","Vaccinium crassifolium subsp. crassifolium" ,
            "Vaccinium crassifolium subsp. sempervirens",'Elymus glabriflorus','Elymus glabriflorus var. australis',
            'Elymus glabriflorus var. glabriflorus','Elymus glaucus','Elymus glaucus subsp. glaucus',
            'Elymus glaucus subsp. mackenziei','Elymus glaucus subsp. virescens','Ipomoea cordatotriloba',
            'Ipomoea cordatotriloba var. cordatotriloba','Ipomoea cordatotriloba var. torreyana',
            'Juglans major','Juglans major var. major','Juglans microcarpa','Juglans microcarpa var. microcarpa',
            'Leymus mollis','Leymus mollis subsp. mollis','Leymus mollis subsp. villosissimus',
            'Phaseolus leptostachyus','Phaseolus leptostachyus var. leptostachyus','Prunus virginiana',
            'Prunus virginiana var. demissa','Prunus virginiana var. virginiana','Ribes cereum',
            'Ribes cereum var. cereum','Ribes cereum var. colubrinum','Rubus ursinus',
            'Rubus ursinus subsp. macropetalus','Rubus ursinus subsp. ursinus','Tripsacum dactyloides',
            'Tripsacum dactyloides var. dactyloides'
)
## keep every record whose taxon is NOT in the list
t1 <- df1 %>% filter(!taxon %in% spList)
## read in and stack the two edited taxon lists
t2 <- read.csv("D:/cwrNA/parameters/USA_cropWildRelativeInventory/intraspecificAlterations/intraSpecificSpecsNewTaxonEdits.csv")
t3 <- read.csv("D:/cwrNA/parameters/USA_cropWildRelativeInventory/intraspecificAlterations/intraSpecificSpecsList2NewTaxonEdits.csv")
t4 <- rbind(t2, t3)


### NOTE: the recombined dataset has 11 more occurrences than before the
### replacement; the cause is not known yet and is accepted for now.
df2 <- rbind(t1, t4)

# Re-derive genus and species from taxon so they agree with the edited names:
# split a copy of taxon on spaces into up to six words (t1..t6).
df3 <- df2 %>%
  dplyr::mutate(t0 = taxon)%>%
  tidyr::separate(col = t0,into = c("t1","t2", "t3","t4","t5","t6"),
                  sep = " ")
# first word is the genus
df3$genus <- df3$t1
# Columns 18 to 22 are taken by position as the remaining words.
# NOTE(review): this assumes df2 has exactly 16 columns so that t1 sits at
# position 17 - confirm.
df4 <- df3[,18:22]
# Uniting into a column of a second data frame caused quite a few issues;
# keeping the united column in its own object (df4) seemed to fix them.
df4 <- df4 %>% tidyr::unite("z", na.rm = TRUE , sep = " ")


# everything after the genus, joined with spaces, becomes the species
df3$species <- df4$z

# keep the first 16 columns, dropping the helper word columns
df2a <- df3[,1:16]
# colnames(df2) <- c("taxon","genus" ,             
# "species","latitude",
# "longitude","databaseSource","institutionCode","type","uniqueID","sampleCategory",
# "country","iso3", "localityInformation", "biologicalStatus","collectionSource",
# "finalOriginStat")
# Write out the final csv; the file name carries today's date.
write.csv(x = df2a, file = paste0(base_dir,"/combinedOccurance", Sys.Date(), ".csv"))

D:/cwrNA/src/dataPrep/dataBaseTransform/cucurbitaTransform.R 

###
# reworking of cucurbita data
# 20191120
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)

# Folder holding the cucurbita export.
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/cucurbita"

# Read the raw export; every column is kept.
csvPath <- paste0(base_dir,"/Khoury_Cucurbita_paperdata_USAspecies20191206.csv")
data <- data.table::fread(input = csvPath,header = TRUE)
dataThin <- data
nr <- nrow(dataThin)

# Build the standardised 16-column occurrence table straight from dataThin.
# genus is fixed for this source; species is filled in after the taxon has
# been cleaned; the remaining NA columns are not provided by this source.
df <- data.frame(
  taxon = dataThin$taxon,
  genus = "Cucurbita",
  species = NA,
  latitude = dataThin$latitude,
  longitude = dataThin$longitude,
  databaseSource = "cucurbita",
  institutionCode = dataThin$institute,
  type = dataThin$type,
  uniqueID = dataThin$sample_number,
  sampleCategory = dataThin$status,
  country = dataThin$country,
  iso3 = NA,
  localityInformation = dataThin$locality,
  biologicalStatus = NA,
  collectionSource = NA,
  finalOriginStat = NA,
  stringsAsFactors = FALSE
)

# Taxon names arrive with "_" between words; switch to spaces.
df$taxon <- gsub(pattern = "_", replacement = " ", x = df$taxon)

# species is the taxon without the genus. trimws() removes the blank that was
# left in front of the epithet once "Cucurbita" had been deleted.
df$species <- trimws(gsub(pattern = "Cucurbita", replacement = "", x = df$taxon))

#write CSV
write.csv(x = df, file = paste0(base_dir, "/refinedCucurbita.csv"))
D:/cwrNA/src/dataPrep/dataBaseTransform/cwrOfNorthAmericaTransfom.R 

###
# reworking of the CWR of North America book data
# 20190815
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)

# Folder holding the CWR of North America book export.
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/cwrofnorthamericabook"

# Read the raw export and keep the columns needed downstream.
csvPath <- paste0(base_dir,"/CWRofNAmerica_dataextras.csv")
data <- data.table::fread(csvPath, header = TRUE)
dataThin <- dplyr::select(data,
                          "id", "Taxon","Type","Source",
                          "lat", "lon", "final_cult_stat")
nr <- nrow(dataThin)

# Build the standardised 16-column occurrence table straight from dataThin.
# genus and species start as NA and are derived from the taxon afterwards;
# the other NA columns are not provided by this source.
df <- data.frame(
  taxon = dataThin$Taxon,
  genus = NA,
  species = NA,
  latitude = dataThin$lat,
  longitude = dataThin$lon,
  databaseSource = "cwrofnorthamericabook",
  institutionCode = dataThin$Source,
  type = dataThin$Type,
  uniqueID = dataThin$id,
  sampleCategory = NA,
  country = NA,
  iso3 = NA,
  localityInformation = NA,
  biologicalStatus = dataThin$final_cult_stat,
  collectionSource = NA,
  finalOriginStat = NA,
  stringsAsFactors = FALSE
)

# Determine genus and species from taxon: split on "_" into genus, species,
# infraspecific rank (sep1) and infraspecific epithet (var1).
test1 <- tidyr::separate(data = df,taxon,into=c("genus","species","sep1","var1"),sep="_")

# Full species string: the bare species when there is no infraspecific
# epithet, otherwise species, rank and epithet joined with "_".
# Vectorised - the former 1:nrow(test1) loop failed on an empty table.
# NOTE(review): the pieces are joined with "_" while taxon (below) uses
# spaces - confirm this is intended.
test1$fullSpecies <- ifelse(
  is.na(test1$var1),
  test1$species,
  paste(test1$species, test1$sep1, test1$var1, sep="_")
)

#set genus and species in final DF
df$genus <- test1$genus
df$species <- test1$fullSpecies

# replace _ with " " on taxon
df$taxon <- gsub(pattern = "_", replacement = " ",x = df$taxon)


# Load checkSynonym() from the shared helper file and run it over the table.
# NOTE(review): presumably this reconciles taxon synonyms; confirm in the
# sourced file.
source(file="D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonymsFunction.R")
df <- checkSynonym(df)


# Flag, per record, whether a latitude / longitude value is present.
# NA, "\\N" and "" all count as missing. `<<-` is kept as in the original.
testLatLong <<- dplyr::mutate(
  dplyr::select(df, c("uniqueID","latitude", "longitude")),
  hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "",
  hasLong = !is.na(longitude) & longitude != "\\N"& longitude != "",
  hasLatLong = hasLat & hasLong
)

# Records where exactly one of the two coordinates is present
summariseErrors <- dplyr::filter(testLatLong, xor(hasLat, hasLong))

print(paste0("there are ", nrow(summariseErrors)," miss matach lat long pairs."))
write.csv(x = summariseErrors, file = paste0(base_dir,"/mismatchLatLong.csv"))


# Save the standardized table
write.csv(x = df, file = paste0(base_dir,"/refinedcwrOfNABook.csv"))

D:/cwrNA/src/dataPrep/dataBaseTransform/cwrTransform.R 

###
# reworking of cwr_occ data
# 20190815
# carver.dan1@gmail.com
###

library(tidyverse)
library(data.table)

# Working directory for this source
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/cwr_occ/20191114"
# Read the raw table; d1 keeps the first rows for quick inspection
csvPath <- paste0(base_dir,"/colin_dan_data.csv")
data <- data.table::fread(csvPath, header = TRUE)
d1 <- head(data)


# Rows of the exclusion sheet flagged "Y".
# NOTE(review): `exclude` is not referenced again in this script; the
# hard-coded listExclude below is what actually drives the filter.
exclude <- dplyr::filter(
  read.csv(file = "D:/cwrNA/occurrence_data2019_05_29/cwr_occ/ExcludefromCWROCC.csv", header=TRUE),
  Exclude == "Y"
)

# Source file names whose records are dropped.
# NOTE(review): "raw_Helianthus_Marek_newdata_ff" is the only entry without
# an ".xlsx" suffix; confirm it matches the value stored in `filename`.
listExclude<- c("GBIF","raw_ATGGC_ff.xlsx","raw_AVRDC_ff.xlsx",
                "raw_Bioversity_EUR_SING_2013_1a_ff.xlsx",
                "raw_Bioversity_EUR_SING_2013_1b_ff.xlsx",
                "raw_Bioversity_EUR_SING_2013_1c_ff.xlsx",
                "raw_Bioversity_EUR_SING_2013_1d_ff.xlsx",
                "raw_Bioversity_EUR_SING_2013_1e_ff.xlsx",
                "raw_Bioversity_EUR_SING_2013_2a_ff.xlsx",
                "raw_Bioversity_EUR_SING_2013_2b_ff.xlsx",
                "raw_Bioversity_EUR_SING_2013_2c_ff.xlsx",
                "raw_Bioversity_EUR_SING_2013_2d_ff.xlsx",
                "raw_CATIE_2014_ff.xlsx",
                "raw_CK_USDA_NPGS_GRIN_USCWR.xlsx",
                "raw_Helianthus_Marek_newdata_ff",
                "raw_IRRI_newdata_ff.xlsx",
                "raw_USCWRExtras_CS_FF.xlsx",
                "raw_USDA_NPGS_GRIN_extras_ff.xlsx",
                "raw_USDA_NPGS_GRIN_FF.xlsx",
                "raw_USDA_NPGS_GRIN_Hijmans_ff.xlsx")


# Keep only records whose source file is not on the exclusion list
data <- data[!(data$filename %in% listExclude), ]
#View(data)
dim(data)


# Select necessary columns from dataset
dataThin <- data %>%
  dplyr::select("id","source","provider_institute_id","f_x1_genus","f_x1_sp1","taxstand_final_taxon",
        "final_lon", "final_lat", "adm1", "adm2", "adm3", "adm4", "locality", "cultivated")

# filter by final cult status- not "cultivated"
# `final_cult_stat` is not among the selected columns, so the test has to be
# made against the raw table (same row order as dataThin). Previously the
# filter read dataThin$final_cult_stat, which does not exist.
# NOTE(review): if the intended status column is actually `cultivated`,
# switch the test to that column; its coding is not visible here.
if ("final_cult_stat" %in% names(data)) {
  dataThin <- dataThin[!data$final_cult_stat %in% c("cultivated"), ]
} else {
  warning("column 'final_cult_stat' not found; cultivated records were not removed",
          call. = FALSE)
}
dim(dataThin)

# Row count is taken AFTER filtering so it matches the rows copied below
nr <- nrow(dataThin)

# Build the standardized occurrence table; fields this source does not
# supply are NA. species is filled in later from the split taxon name.
df <- data.frame(taxon = dataThin$taxstand_final_taxon,
                 genus = dataThin$f_x1_genus,
                 species = NA,
                 latitude = dataThin$final_lat,
                 longitude = dataThin$final_lon,
                 databaseSource = "cwr_occ",
                 institutionCode = dataThin$provider_institute_id,
                 type = dataThin$source,
                 uniqueID = dataThin$id,
                 sampleCategory = NA,
                 country = NA,
                 iso3 = NA,
                 localityInformation = dataThin$locality,
                 biologicalStatus = NA,
                 collectionSource = NA,
                 finalOriginStat = NA,
                 stringsAsFactors = FALSE)
   

# Split a copy of the "_"-delimited taxon into genus, species epithet and up
# to three further pieces (sub1..sub3); the helper columns are used by the
# species-compilation step that follows.
#test <- df[1:100,]
df[["name"]] <- df[["taxon"]]
df <- tidyr::separate(
  data = df,
  "name",
  into = c('genus1','spec','sub1','sub2','sub3'),
  sep = '_'
)
#View(df[1:1000,])


# for what ever reason the first clause of this funtion is not working. I'm leaving it for now because
# I dont think we actaully need a acurate species to do the analysis
##
#View(df4[1:1000,])
# Taxon is stored with spaces instead of underscores
df6 <- str_replace_all(string = df$taxon, pattern = "_", replacement = " ")
#View(df6)
df$taxon <- df6
# Snapshot of the table before species are compiled
df1 <- df

df <- df1
# compile species
# The species label is the epithet plus every infraspecific piece up to the
# first missing one, joined with "_". Computed for all rows at once instead
# of a per-row loop that printed every index, and safe on an empty table
# (where `1:nrow(df)` would iterate over c(1, 0)).
sps <- ifelse(
  is.na(df$sub1),
  df$spec,
  ifelse(
    is.na(df$sub2),
    paste(df$spec, df$sub1, sep = "_"),
    ifelse(
      is.na(df$sub3),
      paste(df$spec, df$sub1, df$sub2, sep = "_"),
      paste(df$spec, df$sub1, df$sub2, df$sub3, sep = "_")
    )
  )
)
# paste() renders a missing epithet as the text "NA"; strip it, then turn
# the "_" joiners into spaces
sps <- str_replace_all(sps, pattern = "NA", replacement = "")
sps <- str_replace_all(sps, pattern = "_", replacement = " ")
df$species <- sps

#View(test)

# Drop the helper columns created while splitting the taxon name
df2 <- subset(x = df, select = -c(genus1, spec,sub1,sub2,sub3) )
#View(df4[1:1000,])

# Flag, per record, whether a latitude / longitude value is present.
# NA, "\\N" and "" all count as missing.
testLatLong <<- df2 %>%
  dplyr::select(c("uniqueID","latitude", "longitude")) %>%
  mutate(hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "") %>%
  mutate(hasLong = !is.na(longitude) & longitude != "\\N"& longitude != "") %>%
  mutate(hasLatLong = hasLat & hasLong)

# Records where exactly one of the two coordinates is present
summariseErrors <- testLatLong %>%
  filter(hasLat == TRUE & hasLong ==FALSE | hasLat == FALSE & hasLong ==TRUE)

print(paste0("there are ", nrow(summariseErrors)," miss matach lat long pairs."))
write.csv(x = summariseErrors, file = paste0(base_dir,"/mismatchLatLong.csv"))


# complete the check synonym process
source(file="D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonymsFunction.R")
df3 <- checkSynonym(df2)

# write out the new dataframe
# Write the synonym-checked table (df3). Previously df2 was written, which
# discarded the result of checkSynonym(); the other transform scripts in
# this file all write the checked table.
write.csv(x = df3, file = paste0(base_dir,"/refinedcwrOCC.csv"))
D:/cwrNA/src/dataPrep/dataBaseTransform/gbifTransform.R 

###
# reworking of GBIF data
# 20190815
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)
library(readr)
library(sqldf)
library(MazamaSpatialUtils)
# Working directory for this source
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/GBIF"

# Header row of the original export (tab separated, UTF-16). Within this
# script its names are only referenced by the commented-out chunked-import
# code below.
header <- read.csv(
  paste0(base_dir,"/wetransfer-51b480/head.csv"),
  sep = '\t',
  fileEncoding = 'UTF-16',
  header = TRUE
)


# Thinned GBIF table that the rest of the script reads
csvPath <- paste0(base_dir,"/cleanGBIF20191119.csv")

# #old dataset 
# csv2 <- paste0(base_dir,"/occurrences_may2019.csv")
# # contain 46 columns so if need be we could apply header to this database with a few assumptions
# dOld <- data.table::fread(csv2, header= FALSE)
# #fread -- will not read in utf-16 
# data <- data.table::fread(csvPath, header = FALSE,)
# #readr - reads in the information but does writes it all as NA 
# ds <- readr::read_tsv(file = csvPath, n_max = 500,col_names = names(header))
# # rsql - still based on read.table so this did not work 
# df3 <- sqldf::read.csv.sql(file = csvPath, header = FALSE, sep ="\t", nrows = 1000)


# ### iterative process using read.csv https://stackoverflow.com/questions/9352887/strategies-for-reading-in-csv-files-in-pieces/30403877#30403877
# # establishing a connection to the file  
# con <- file(csvPath, "r", encoding = 'UTF-16')
# #close(con)
# # create a dataframe to bind outputs to 
# df2 <- data.frame()
# 
# rows <- 10000
# x =1 
# while(rows ==10000){
#   df <- read.csv(con,header = FALSE,fileEncoding = 'UTF-16',sep = '\t',nrows = 10000)
#   rows <- nrow(df)
#   print(rows)
#   colnames(df) <- names(header)
#   dataThin <- df %>%
#     dplyr::select("gbifID", "genus", "species", "infraspecificEpithet", "taxonRank",
#                   "countryCode", "locality", "stateProvince", "decimalLatitude", 
#                   "decimalLongitude", "basisOfRecord", "institutionCode" )
#   df2 <- rbind(df2, dataThin)
#   x = x+1 
#   print(x)
# }
# # write out df2 as new GBIF dataset 
# write.csv(x = df2, file = paste0(base_dir, '/cleanGBIF20191119.csv'))
# dim(df2)

### read in clean data
df2 <- data.table::fread(input = csvPath, header = TRUE)

# Export the sorted, unique institution codes so they can be reviewed
# (write.csv stores the vector in a column named "x")
instituteCodes <- sort(unique(df2$institutionCode))
write.csv(x = instituteCodes, file = paste0(base_dir, '/uniqueInstituteCodesGBIF20191120.csv'))
# Read the reviewed copy back and keep the codes marked exclude == "Y"
exclude <- dplyr::filter(
  read.csv(paste0(base_dir, '/uniqueInstituteCodesGBIF20191120_ck.csv')),
  exclude=="Y"
)

# Drop records from the excluded institutions
df3 <- df2[!(df2$institutionCode %in% exclude$x), ]
dim(df3)
# Drop fossil specimens
df3 <- df3[df3$basisOfRecord != "FOSSIL_SPECIMEN", ]
dim(df3)
# replace ISO2 with ISO3
# Records are split three ways and re-stacked so that none is lost:
#   naCC    - country code is NA (left as is)
#   iso2s   - exactly two characters, converted to ISO3
#   badIso2 - any other length, code set to NA
naCC <- df3 %>%
  dplyr::filter(is.na(countryCode))
nonNACC <- df3 %>%
  dplyr::filter(!is.na(countryCode))
iso2s <- nonNACC %>%
  dplyr::filter(nchar(countryCode) == 2)
iso2s$countryCode <- MazamaSpatialUtils::iso2ToIso3(iso2s$countryCode)
# `!= 2` rather than `> 2`: codes shorter than two characters (for example
# an empty string) previously matched neither subset and the whole record
# silently disappeared from df4.
badIso2 <- nonNACC %>%
  dplyr::filter(nchar(countryCode) != 2)
badIso2$countryCode <- NA

df4 <- rbind(naCC, iso2s, badIso2)

# Coordinate window: records lacking either coordinate are kept untouched;
# records with both must have latitude > 10 and longitude < -50.
nolatLong <- filter(df4, is.na(decimalLatitude) | is.na(decimalLongitude))
latLong <- filter(
  df4,
  !is.na(decimalLatitude) & !is.na(decimalLongitude),
  decimalLatitude > 10,
  decimalLongitude < -50
)

df5 <- rbind(nolatLong, latLong)


# Locality text: stateProvince and locality merged into one column, local2
df5 <- tidyr::unite(df5, col = 'local2', stateProvince, locality, sep = " -- ")

### construct species -- this process drops some rows where taxonRank is not species, subspecies, or varity
# I beleive that most of those are data errors anyway so I'm ok with it.
# for species only data: taxon is the binomial, species is its second word
sp1 <- df5 %>%
  filter(taxonRank == "SPECIES")
sp1$taxon <- sp1$species
sp1 <- sp1 %>% tidyr::separate(col = species, c("genus1", "species1"), sep = " " )
sp1$species <- sp1$species1

# for subsp and var: species becomes "<epithet> <rank abbrev.> <infraspecific>"
sp2 <- df5 %>%
  filter(taxonRank %in% c("SUBSPECIES", "VARIETY"))

sp2$taxonRank <- gsub(pattern = "SUBSPECIES", replacement = "subsp.",x = sp2$taxonRank)
sp2$taxonRank <- gsub(pattern = "VARIETY", replacement = "var.",x = sp2$taxonRank)

sp2 <- sp2 %>% tidyr::separate(col = species, c("genus1", "species1"), sep = " " )
sp2$species <- paste0(sp2$species1, " ", sp2$taxonRank, " ", sp2$infraspecificEpithet)

sp2$taxon <- paste0(sp2$genus, " ", sp2$species)

# join the two datasets
# sp1 ends with (taxon, species) while sp2 ends with (species, taxon).
# bind_rows() matches columns by NAME, so the two cannot be swapped; rbind()
# on data.tables may fall back to positional matching when the column order
# differs.
df5 <- dplyr::bind_rows(sp1, sp2)

# construct taxon 


nr <- nrow(df5)
# Build the standardized occurrence table; fields this source does not
# supply are NA. type is NA here and derived from sampleCategory afterwards.
df <- data.frame(taxon = df5$taxon,
                 genus = df5$genus,
                 species = df5$species,
                 latitude = df5$decimalLatitude,
                 longitude = df5$decimalLongitude,
                 databaseSource = "GBIF",
                 institutionCode = df5$institutionCode,
                 type = NA,
                 uniqueID = as.factor(df5$gbifID),
                 sampleCategory = df5$basisOfRecord,
                 country = NA,
                 iso3 = df5$countryCode,
                 # local2 is the "stateProvince -- locality" text united
                 # earlier in this script; it was previously built but never
                 # stored, leaving this column NA for every record.
                 localityInformation = df5$local2,
                 biologicalStatus = NA,
                 collectionSource = NA,
                 finalOriginStat = NA,
                 stringsAsFactors = FALSE)


# Record type from basis of record: "G" for living specimens, "H" for
# everything else. The two subsets are stacked H first, then G.
g <- dplyr::mutate(
  dplyr::filter(df, sampleCategory == "LIVING_SPECIMEN"),
  type = "G"
)
h <- dplyr::mutate(
  dplyr::filter(df, sampleCategory != "LIVING_SPECIMEN"),
  type = "H"
)

df <- rbind(h, g)

# Load checkSynonym() from the shared helper file and run it over the table.
# NOTE(review): presumably this reconciles taxon synonyms; confirm in the
# sourced file.
source(file="D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonymsFunction.R")
df <- checkSynonym(df)

# Flag, per record, whether a latitude / longitude value is present.
# NA, "\\N" and "" all count as missing. `<<-` is kept as in the original.
testLatLong <<- dplyr::mutate(
  dplyr::select(df, c("uniqueID","latitude", "longitude")),
  hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "",
  hasLong = !is.na(longitude) & longitude != "\\N"& longitude != "",
  hasLatLong = hasLat & hasLong
)

# Records where exactly one of the two coordinates is present
summariseErrors <- dplyr::filter(testLatLong, xor(hasLat, hasLong))

print(paste0("there are ", nrow(summariseErrors)," miss matach lat long pairs."))
write.csv(x = summariseErrors, file = paste0(base_dir,"/mismatchLatLong.csv"))


# Save the standardized table
write.csv(x = df, file = paste0(base_dir,"/refinedGBIF.csv"))

D:/cwrNA/src/dataPrep/dataBaseTransform/genesysTransform.R 

###
# reworking of genesys data
# 20190815
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)

# Working directory for this source
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/Genesys"

# Read the raw accessions table
csvPath <- paste0(base_dir,"/genesys-accessions-v18461d86381494a49a0faaa913f74c3da.csv")
data <- data.table::fread(csvPath, header = TRUE)#87940    43
# Keep only the fields that feed the standardized occurrence table
dataThin <- select(data,
                   "INSTCODE", "ACCENUMB", "GENUS",
                   "SPECIES", "SUBTAXA",
                   "ORIGCTY","COLLSITE","DECLATITUDE","DECLONGITUDE", "SAMPSTAT", "COLLSRC",
                   "HISTORIC")
nr <- nrow(dataThin)

# Build the standardized occurrence table in a single step. Fields this
# source does not supply are NA; taxon and type are derived further down.
# historicTemp carries the HISTORIC flag alongside each row.
df <- data.frame(taxon = NA,
                 genus = dataThin$GENUS,
                 species = dataThin$SPECIES,
                 latitude = dataThin$DECLATITUDE,
                 longitude = dataThin$DECLONGITUDE,
                 databaseSource = "Genesys",
                 institutionCode = dataThin$INSTCODE,
                 type = NA,
                 uniqueID = dataThin$ACCENUMB,
                 sampleCategory = dataThin$SAMPSTAT,
                 country = NA,
                 iso3 = dataThin$ORIGCTY,
                 localityInformation = dataThin$COLLSITE,
                 biologicalStatus = NA,
                 collectionSource = dataThin$COLLSRC,
                 finalOriginStat = NA,
                 historicTemp = dataThin$HISTORIC,
                 stringsAsFactors = FALSE)
dim(df)

# define species by combining species and subspecies categories
# Rows with a non-empty SUBTAXA get "<SPECIES> <SUBTAXA>"; all other rows
# keep the plain species. Done as one vectorised assignment: the previous
# per-row `if` stopped with an error on a missing (NA) SUBTAXA, and
# `1:nrow()` misbehaves on an empty table. df and dataThin still have
# identical row order at this point (no filtering has happened yet).
hasSubtaxa <- !is.na(dataThin$SUBTAXA) & dataThin$SUBTAXA != ""
df$species[hasSubtaxa] <- paste(dataThin$SPECIES[hasSubtaxa],
                                dataThin$SUBTAXA[hasSubtaxa],
                                sep = " ")

# generate taxon by combining genus and species
df$taxon <- paste0(df$genus," ", df$species)


# SAMPSTAT: remove non wild - codes of 300 and above are excluded, while 999
# and missing values are kept
df <- df[is.element(df$sampleCategory, c(999,100,110,120, 130, 200, 999, NA)), ]
dim(df)

# COLLSRC: exclude 30, 40, 50
df <- df[!is.element(df$collectionSource, c(30, 40, 50)), ]
dim(df)

# Institution codes of the USDA collections; all of their records are removed
USDAcodes <- c("USA003" ,"USA004", "USA005" ,"USA016" ,"USA020",
"USA022", "USA026", "USA028", "USA029", "USA042" ,"USA047", "USA049",
"USA074", "USA108", "USA133", "USA148", "USA151", "USA167", "USA176",
 "USA390", "USA955", "USA956", "USA970", "USA971", "USA995")

df <- df[!is.element(df$institutionCode, USDAcodes), ]
dim(df)


# test for historic sample
# "H" when the record is flagged historic, "G" otherwise (a missing flag is
# treated as not historic).
# The flag is read from df$historicTemp, which has travelled with each row
# through the filters above. The previous loop indexed dataThin$HISTORIC by
# df's row position, but dataThin is unfiltered, so after any row had been
# removed the flags no longer lined up with the records.
df$type <- ifelse(df$historicTemp %in% TRUE, "H", "G")


# Load checkSynonym() from the shared helper file and run it over the table.
# NOTE(review): presumably this reconciles taxon synonyms; confirm in the
# sourced file.
source(file="D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonymsFunction.R")
df <- checkSynonym(df)
dim(df)


# Flag, per record, whether a latitude / longitude value is present.
# NA, "\\N" and "" all count as missing. `<<-` is kept as in the original.
testLatLong <<- dplyr::mutate(
  dplyr::select(df, c("uniqueID","latitude", "longitude")),
  hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "",
  hasLong = !is.na(longitude) & longitude != "\\N"& longitude != "",
  hasLatLong = hasLat & hasLong
)

# Records where exactly one of the two coordinates is present
summariseErrors <- dplyr::filter(testLatLong, xor(hasLat, hasLong))

print(paste0("there are ", nrow(summariseErrors)," miss matach lat long pairs."))
write.csv(x = summariseErrors, file = paste0(base_dir,"/mismatchLatLong.csv"))


# Save the standardized table
write.csv(x = df, file = paste0(base_dir,"/refinedGenesys.csv"))


D:/cwrNA/src/dataPrep/dataBaseTransform/idigbioTransform.R 

###
# reworking of idigbio data
# 20190815
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)

#set base dir
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/idigbio"

# Source file: D:/cwrNA/occurrence_data2019_05_29/idigbio/occurrence.csv
# (This path used to sit here as a bare string literal with single
# backslashes. "\c", "\o" and "\i" are not valid escapes in an R string, so
# the script could not even be parsed.)

# Load in data
csvPath <- paste0(base_dir,"/occurrence.csv")
data <- data.table::fread(csvPath, header = TRUE)
# Select necessary columns from dataset
dataThin <- data %>%
  select("coreid","dwc:basisOfRecord", "gbif:canonicalName",
        "dwc:country","idigbio:isoCountryCode","dwc:genus", "idigbio:geoPoint")
nr <- nrow(dataThin)

# Build the standardized occurrence table; fields this source does not
# supply are NA. species, latitude and longitude are filled in further down
# from the canonical name and the geoPoint field.
# NOTE(review): databaseSource carries a trailing space ("idigbio "); left
# unchanged in case downstream code matches on it. Confirm whether intended.
df <- data.frame(taxon = dataThin$`gbif:canonicalName`,
                 genus = dataThin$`dwc:genus`,
                 species = NA,
                 latitude = NA,
                 longitude = NA,
                 databaseSource = "idigbio ",
                 institutionCode = NA,
                 type = "H",
                 uniqueID = dataThin$coreid,
                 sampleCategory = dataThin$`dwc:basisOfRecord`,
                 country = dataThin$`dwc:country`,
                 iso3 = dataThin$`idigbio:isoCountryCode`,
                 localityInformation = NA,
                 stringsAsFactors = FALSE)

# define species by combining species and subspecies categories
# Split a copy of the canonical name on single spaces into up to five words.
# `sep` is an argument of separate(); it previously sat inside the c() that
# builds `into`, which created a sixth output column named " " and left the
# split on separate()'s default separator. fill = "right" pads shorter names
# with NA without a warning per row.
df$taxon2 <- df$taxon
df <- tidyr::separate(data = df, col = "taxon2",
                      into = c("Genus", "Species", "middle",
                               "subSpe", "subSpe2"),
                      sep = " ", fill = "right")

# Set `species` for every row of dataFrame (vectorised):
#   - no third word (middle is NA): the plain species epithet
#   - otherwise: Species, middle, subSpe and subSpe2 joined with "_",
#     with trailing missing pieces (which paste() renders as "NA") removed
# The earlier version used `if` on whole columns, which only ever looked at
# the first row and is an error in current R (condition of length > 1).
# Returns dataFrame with the species column filled in.
setSpecies <- function(dataFrame){
  infra <- paste(dataFrame$Species, dataFrame$middle,
                 dataFrame$subSpe, dataFrame$subSpe2, sep = "_")
  infra <- sub("(_NA)+$", "", infra)
  dataFrame$species <- ifelse(is.na(dataFrame$middle),
                              dataFrame$Species,
                              infra)
  return(dataFrame)
}
df4 <- setSpecies(df)

# split out the lat long data
# The geoPoint text is reduced to "<lat>,<long>" by removing the doubled
# quotes, the "lat:" / "lon:" labels and the braces, then split on ",".
# NOTE(review): the first pattern is two consecutive double-quote characters;
# this assumes geoPoint arrives with doubled quotes after fread. Confirm
# against the data.
df6 <- str_remove_all(dataThin$`idigbio:geoPoint`, '"\"') %>%
  str_remove_all("lat:") %>%
  str_remove_all("lon:") %>%
  str_remove_all("\\{|\\}") %>%
  as.data.frame()
colnames(df6)<- "v1"

df6 <- separate(data = df6,col = "v1",into = c("lat","long"),sep = ",")
df4$latitude <- df6$lat
df4$longitude <- df6$long
# drop the helper columns by name. The former extra step "drop the last
# column" only existed to remove the stray " " column and is no longer needed.
df4 <- subset(df4, select = -c(Genus,Species,middle,subSpe,
                             subSpe2))
head(df4)


# Flag, per record, whether a latitude / longitude value is present.
# NA, "\\N" and "" all count as missing. `<<-` is kept as in the original.
testLatLong <<- dplyr::mutate(
  dplyr::select(df4, c("uniqueID","latitude", "longitude")),
  hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "",
  hasLong = !is.na(longitude) & longitude != "\\N"& longitude != "",
  hasLatLong = hasLat & hasLong
)

# Records where exactly one of the two coordinates is present
summariseErrors <- dplyr::filter(testLatLong, xor(hasLat, hasLong))

print(paste0("there are ", nrow(summariseErrors)," miss matach lat long pairs."))
write.csv(x = summariseErrors, file = paste0(base_dir,"/mismatchLatLong.csv"))


# Save the standardized table
write.csv(x = df4, file = paste0(base_dir,"/refinedIdigBio.csv"))
D:/cwrNA/src/dataPrep/dataBaseTransform/midwestHerbTransform.R 

###
# reworking of Midwest herbaria data
# 20190815
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)

# Working directory for this source
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/Midwest_herbaria/SymbOutput_2019-05-28_151547_DwC-A"

# Read the raw occurrence table
csvPath <- paste0(base_dir,"/occurrences.csv")
data <- data.table::fread(csvPath, header = TRUE)
# Keep only the fields that feed the standardized occurrence table
dataThin <- select(data,
                   "id","institutionCode", "basisOfRecord","occurrenceID",
                   "scientificName","genus","specificEpithet", "taxonRank" , "infraspecificEpithet",
                   "country","stateProvince","county", "municipality", "locality",
                   "decimalLatitude","decimalLongitude")
nr <- nrow(dataThin)

# Locality text: country, stateProvince, county, municipality and locality
# concatenated with " -- "
d2 <- tidyr::unite(dataThin, "local2", country, stateProvince, county,
                   municipality, locality, sep = " -- ")

# Species text: specificEpithet, taxonRank and infraspecificEpithet
# concatenated with a single space
d3 <- tidyr::unite(dataThin, "fullSpecies", specificEpithet, taxonRank,
                   infraspecificEpithet, sep = " ")

# Build the standardized occurrence table in a single step; fields this
# source does not supply are NA, and every record is typed "H".
# NOTE(review): databaseSource carries a trailing space
# ("midwestHerbarium "); confirm whether that is intended.
df <- data.frame(taxon = dataThin$scientificName,
                 genus = dataThin$genus,
                 species = d3$fullSpecies,
                 latitude = dataThin$decimalLatitude,
                 longitude = dataThin$decimalLongitude,
                 databaseSource = "midwestHerbarium ",
                 institutionCode = dataThin$institutionCode,
                 type = "H",
                 uniqueID = dataThin$id,
                 sampleCategory = dataThin$basisOfRecord,
                 country = dataThin$country,
                 iso3 = NA,
                 localityInformation = d2$local2,
                 biologicalStatus = NA,
                 collectionSource = NA,
                 finalOriginStat = NA,
                 stringsAsFactors = FALSE)

# Load checkSynonym() from the shared helper file and run it over the table.
# NOTE(review): presumably this reconciles taxon synonyms; confirm in the
# sourced file.
source(file="D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonymsFunction.R")
df <- checkSynonym(df)


# Flag, per record, whether a latitude / longitude value is present.
# NA, "\\N" and "" all count as missing. `<<-` is kept as in the original.
testLatLong <<- dplyr::mutate(
  dplyr::select(df, c("uniqueID","latitude", "longitude")),
  hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "",
  hasLong = !is.na(longitude) & longitude != "\\N"& longitude != "",
  hasLatLong = hasLat & hasLong
)

# Records where exactly one of the two coordinates is present
summariseErrors <- dplyr::filter(testLatLong, xor(hasLat, hasLong))

print(paste0("there are ", nrow(summariseErrors)," miss matach lat long pairs."))
write.csv(x = summariseErrors, file = paste0(base_dir,"/mismatchLatLong.csv"))


# Save the standardized table
write.csv(x = df, file = paste0(base_dir,"/refinedMidwestHerbarium.csv"))
D:/cwrNA/src/dataPrep/dataBaseTransform/usdaGrin.R 

###
# reworking of USDA NPGS GRIN-Global data
# 20190815
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)

# Working directory for this source
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/USDA_NPGS_GRINGlobal"

# Read the raw accession table
csvPath <- paste0(base_dir,"/USDA_CWRofUSA.csv")
data <- data.table::fread(input = csvPath,header = TRUE)
# Keep only the fields that feed the standardized occurrence table
dataThin <- select(data,
                   "Taxon","accession_number","status_code", "site_short_name","improvement_status_code",
                   "country", "latitude","longitude","formatted_locality")
nr <- nrow(dataThin)

# Build the standardized occurrence table in a single step. Fields this
# source does not supply are NA; genus, species and type are derived further
# down the script.
df <- data.frame(taxon = dataThin$Taxon,
                 genus = NA,
                 species = NA,
                 latitude = dataThin$latitude,
                 longitude = dataThin$longitude,
                 databaseSource = "USDA_NPGS_GRINGlobal",
                 institutionCode = dataThin$site_short_name,
                 type = NA,
                 uniqueID = dataThin$accession_number,
                 sampleCategory = as.character(dataThin$status_code),
                 country = dataThin$country,
                 iso3 = NA,
                 localityInformation = dataThin$formatted_locality,
                 biologicalStatus = dataThin$improvement_status_code,
                 collectionSource = NA,
                 finalOriginStat = NA,
                 stringsAsFactors = FALSE)

# Load checkSynonym() from the shared helper file and run it over the table.
# NOTE(review): presumably this reconciles taxon synonyms; confirm in the
# sourced file.
source(file="D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonymsFunction.R")
df <- checkSynonym(df)

# applying G or H value based on status category
# "H" when the status code is "INACTIVE", "G" for everything else, including
# a missing status (the same rule as the commented-out setType() draft later
# in this script). Vectorised: the previous per-row `if` stopped with an
# error on an NA status, and `1:nrow()` misbehaves on an empty table.
df$type <- ifelse(df$sampleCategory %in% "INACTIVE", "H", "G")

# Keep only records whose improvement status is wild, uncertain or blank
df <- df[is.element(df$biologicalStatus, c("WILD","UNCERTAIN","")), ]


# Split a copy of the taxon on single spaces into genus, species epithet and
# up to two further words (sub1, sub2)
#test <- df[1:100,]
df[["name"]] <- df[["taxon"]]
df <- tidyr::separate(
  data = df,
  "name",
  into = c('genus','spec','sub1','sub2'),
  sep = ' '
)

# Fill the `species` column from the split-name helper columns.
#
# dataFrame: data frame with character columns `spec`, `sub1` and `sub2`.
# Returns dataFrame with `species` set row by row:
#   - sub1 missing: the plain species epithet (spec)
#   - otherwise:    spec, sub1 and sub2 joined with "_" (a missing sub2 is
#                   rendered as the text "NA" by paste(); the caller strips
#                   that afterwards)
# Vectorised with ifelse(): the earlier `if (!is.na(dataFrame$sub1))` tested
# a whole column, which is an error in current R (condition of length > 1)
# and in older R silently applied the first row's answer to every row.
setSpecies <- function(dataFrame){
  dataFrame$species <- ifelse(
    is.na(dataFrame$sub1),
    dataFrame$spec,
    paste(dataFrame$spec, dataFrame$sub1, dataFrame$sub2, sep = "_")
  )
  return(dataFrame)
}
df4 <- setSpecies(df)

# Strip the text "NA" and any doubled underscore from the species label
df6 <- str_remove_all(str_remove_all(df4$species, 'NA'), "__")
df4$species <- df6

# # Set type based on value in status code
# setType <- function(dataFrame){
#   if(is.na(dataFrame$sampleCategory)){
#     dataFrame$type <- "G"
#   }else(
#     if(dataFrame$sampleCategory == "INACTIVE"){
#       dataFrame$type <- "H"
#     }else{
#       dataFrame$type <- "G"
#     }
#
#   )
#
#   return(dataFrame)
# }
#
# df4 <- setType(df4)
# Drop the helper columns left over from splitting the name
df4 <- df4[, !(names(df4) %in% c("spec", "sub1", "sub2")), drop = FALSE]

# Flag, per record, whether a latitude / longitude value is present.
# NA, "\\N" and "" all count as missing. `<<-` is kept as in the original.
testLatLong <<- dplyr::mutate(
  dplyr::select(df4, c("latitude", "longitude")),
  hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "",
  hasLong = !is.na(longitude) & longitude != "\\N"& longitude != "",
  hasLatLong = hasLat & hasLong
)

# Records where exactly one of the two coordinates is present
summariseErrors <- dplyr::filter(testLatLong, xor(hasLat, hasLong))

print(paste0("there are ", nrow(summariseErrors)," miss matach lat long pairs."))


# Save the standardized table
write.csv(x = df4, file = paste0(base_dir,"/refinedUSDAGrin.csv"))
D:/cwrNA/src/dataPrep/dataBaseTransform/wiewsTransform.R 

###
# reworking of WIEWS data
# 20190815
# carver.dan1@gmail.com
### 

library(tidyverse)
library(data.table)

# Working directory for this source
base_dir <- "D:/cwrNA/occurrence_data2019_05_29/WIEWS"

# Read the raw table
csvPath <- paste0(base_dir,"/COLIN_GENUS.csv")
data <- data.table::fread(csvPath, header = TRUE) # 1516974      21   fill = TRUE just ends rsesion
## issues with read table... sticking with fread for now.
# data2 <- read.table(csvPath, sep = "\t",header = TRUE, fill=TRUE) # 753805     21 line 376281 did not have 21 elements

#View(data)
# Keep only the fields that feed the standardized occurrence table
dataThin <- dplyr::select(
  data,
  "Country of origin (ISO3)", "Country of origin", "Holding institute code",
  "Accession number","Taxon","Latitude of collecting site (decimal degrees format)",
  "Longitude of collecting site (decimal degrees format)", "Type of germplasm storage",
  "Source of information", "Collecting/acquisition source", "Biological status","Source of information"
)

# Keep records whose source of information is FAO-WIEWS or blank (other
# sources overlap with other databases) and whose biological status is
# wild, weedy or blank
dataThin <- filter(
  dataThin,
  `Source of information` %in% c("FAO-WIEWS", ""),
  `Biological status` %in% c("100) Wild", "200) Weedy", "")
)

# Keep only the listed germplasm storage types (or blank)
dataThin <- dataThin[is.element(dataThin$`Type of germplasm storage`,
                                c("20) Field","13) Seed long-term", "","12) Seed medium-term")), ]

nr <- nrow(dataThin)


# Build the standardized occurrence table in a single step. Fields this
# source does not supply are NA; every record is typed "G"; genus and
# species are derived from the taxon further down.
# NOTE(review): localityInformation and collectionSource are both filled
# from "Collecting/acquisition source"; confirm that is intended.
df <- data.frame(taxon = dataThin$Taxon,
                 genus = NA,
                 species = NA,
                 latitude = dataThin$`Latitude of collecting site (decimal degrees format)`,
                 longitude = dataThin$`Longitude of collecting site (decimal degrees format)`,
                 databaseSource = "wiews",
                 institutionCode = dataThin$`Holding institute code`,
                 type = "G",
                 uniqueID = dataThin$`Accession number`,
                 sampleCategory = dataThin$`Type of germplasm storage`,
                 country = NA,
                 iso3 = NA,
                 localityInformation = dataThin$`Collecting/acquisition source`,
                 biologicalStatus = dataThin$`Biological status`,
                 collectionSource = dataThin$`Collecting/acquisition source`,
                 finalOriginStat = NA,
                 stringsAsFactors = FALSE)

# Load checkSynonym() from the shared helper file and run it over the table.
# NOTE(review): presumably this reconciles taxon synonyms; confirm in the
# sourced file.
source(file="D:/cwrNA/src/dataPrep/dataBaseTransform/checkSynonymsFunction.R")
df <- checkSynonym(df)

# Split a copy of the taxon on single spaces into genus, species epithet and
# up to four further words (sub1..sub4)
#test <- df[1:100,]
df[["name"]] <- df[["taxon"]]
df <- tidyr::separate(
  data = df,
  "name",
  into = c('genus','spec','sub1','sub2','sub3', 'sub4'),
  sep = ' '
)
#View(df)


# Build the `species` column from the pieces of the split taxon name.
#
# dataFrame : data frame holding the columns `spec`, `sub1` and `sub2`.
# returns   : the same data frame with `species` set, row by row, to
#             "<spec>_<sub1>_<sub2>" when `sub1` is present and to `spec`
#             alone when `sub1` is NA.
#
# A missing `sub2` is pasted as the text "NA", exactly as before; the
# caller strips that text afterwards.
setSpecies <- function(dataFrame){
  # The test has to be evaluated per row. A bare `if` on the whole column
  # only looked at the first row (and is an error for conditions of
  # length > 1 in R >= 4.2), so every row received the first row's form.
  hasSub <- !is.na(dataFrame$sub1)
  dataFrame$species <- ifelse(hasSub,
                              paste(dataFrame$spec, dataFrame$sub1,
                                    dataFrame$sub2, sep="_"),
                              dataFrame$spec)
  return(dataFrame)
}
# derive the species column from the split name pieces
df4 <- setSpecies(df)

# Strip every occurrence of the literal text "NA" from the species string
# (the residue left when missing name pieces were pasted together), then
# strip the double underscores left behind.
# NOTE(review): this removes "NA" anywhere in the string, including inside
# a genuine name that happens to contain those two letters.
df6 <- str_remove_all(df4$species, 'NA') %>%
  str_remove_all("__")
df4$species <- df6
# drop the intermediate name pieces that are no longer needed
df4 <- subset(x = df4, select = -c(spec,sub1,sub2) )

# Flag, per record, whether latitude and longitude are present. A value
# counts as present when it is not NA, not the text "\N" and not empty.
testLatLong <- df4 %>%
  dplyr::select(c("uniqueID","latitude", "longitude")) %>%
  mutate(hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "") %>%
  mutate(hasLong = !is.na(longitude) & longitude != "\\N"& longitude != "") %>%
  mutate(hasLatLong = hasLat & hasLong)

# records where exactly one of the two coordinates is present
summariseErrors <- testLatLong %>%
  filter(hasLat == TRUE & hasLong ==FALSE | hasLat == FALSE & hasLong ==TRUE)

# report the number of mismatched pairs and save them for inspection
print(paste0("there are ", nrow(summariseErrors)," miss matach lat long pairs."))
write.csv(x = summariseErrors, file = paste0(base_dir,"/mismatchLatLong.csv"))


# write out the new dataframe 
write.csv(x = df4, file = paste0(base_dir,"/refinedWiews.csv"))


D:/cwrNA/src/dataPrep/developCounts.r 

####
# Based on the raw data, summarize by type and presence of lat long
# dan.carver@carverd.com
# 20200414
###

# Summarise the raw occurrence records of one species by record type and by
# the presence of usable coordinates, and write the result to counts.csv.
#
# species : taxon name; not read directly here, the records come from the
#           global `rawData`.
#
# Reads the globals `rawData` and `sp_dir`.
# Side effects: assigns the global `dataThin` (rawData reduced to five
# columns plus the hasLat / hasLong / hasLatLong flags) and writes
# <sp_dir>/counts.csv with one row and 13 columns.
developCounts <- function(species){
    # a coordinate is usable when it is not NA and not one of the textual
    # placeholders for "missing" ("\N", "", "NULL")
    isUsable <- function(x){
      !is.na(x) & x != "\\N" & x != "" & x != "NULL"
    }
    thin <- rawData %>%
      dplyr::select(c("taxon", "latitude", "longitude", "type","databaseSource")) %>%
      mutate(hasLat = isUsable(latitude)) %>%
      mutate(hasLong = isUsable(longitude)) %>%
      mutate(hasLatLong = hasLat & hasLong)
    # published globally; later steps read `dataThin`
    dataThin <<- thin

    # column names for the counts data frame; the last three are
    # placeholders that are filled in once the North American points are known
    colNames <- c("species","totalRecords", "hasLat", "hasLong","totalUseful",
                  "totalGRecords","totalGUseful","totalHRecords","totalHUseful",
                  "numberOfUniqueSources",
                  "NA_occurrences","NA_GUseful","NA_HUseful")
    # number of records per type / coordinate-presence combination
    tbl <- thin %>%
      dplyr::group_by(type, hasLatLong ) %>%
      dplyr::summarize(total = n())
    # one-row counts data frame, sized from the name vector so the two
    # cannot drift apart
    countsData <- data.frame(matrix(NA, nrow = 1, ncol = length(colNames)))
    colnames(countsData) <- colNames
    # assign values to counts df
    countsData$species <- unique(thin$taxon)
    countsData$totalRecords <- nrow(thin)
    countsData$totalUseful <- sum((subset(tbl, hasLatLong == TRUE))$total)
    countsData$totalGRecords <- sum((subset(tbl, type == "G"))$total)
    countsData$totalGUseful <- sum((subset(tbl, type == "G" & hasLatLong == TRUE))$total)
    countsData$totalHRecords <- sum((subset(tbl, type == "H"))$total)
    countsData$totalHUseful <- sum((subset(tbl, type == "H" & hasLatLong == TRUE))$total)
    countsData$hasLat <- sum(thin$hasLat)
    countsData$hasLong <- sum(thin$hasLong)
    countsData$numberOfUniqueSources <- n_distinct(rawData$databaseSource)
    # these values are reassigned in 'addNorthAmericanPointstoCounts.r'
    countsData$NA_occurrences <- 0
    countsData$NA_GUseful <- 0
    countsData$NA_HUseful <- 0

    write.csv(countsData, file = paste0(sp_dir,"/counts.csv"),row.names = FALSE)
}
D:/cwrNA/src/dataPrep/developRaw.r 

###
# Produces the raw data for each species
# dan.carver@carverd.com
# 20200414
###

# Select the occurrence records of one species and write them out as the
# raw dataset for that species.
#
# species : taxon name compared against the `taxon` column of the global
#           `genusOcc`.
#
# Reads the globals `genusOcc` and `sp_dir`.
# Side effects: assigns the global `rawData` and writes
# <sp_dir>/occurrences/rawData.csv.
developRaw <- function(species) {
    # which() keeps only rows where the comparison is TRUE. A plain logical
    # index would turn every record whose taxon is NA into an all-NA row.
    keep <- which(genusOcc$taxon == species)
    rawData <<- genusOcc[keep, ]
    write.csv(rawData, file = paste0(sp_dir,"/occurrences/rawData.csv"),row.names = FALSE)
}
D:/cwrNA/src/dataPrep/genCounts.r 

####

# 10/15/2018

# The goal of this work is to generate a tab separated file that contains counts of multiple parameters as

# defined by the aichi docs


# Summarise the raw occurrence records of one species by record type and by
# the presence of usable coordinates, and write the result to counts.csv.
#
# species : taxon name; not read directly here, the records come from the
#           global `rawData`.
#
# Reads the globals `rawData` and `sp_dir`.
# Side effects: assigns the globals `dataThin` (rawData reduced to five
# columns plus the hasLat / hasLong / hasLatLong flags) and `countsData`
# (the one-row summary), and writes <sp_dir>/counts.csv.
testLatLong <- function(species){
    # a coordinate is usable when it is not NA and not one of the textual
    # placeholders for "missing" ("\N", "", "NULL")
    isUsable <- function(x){
      !is.na(x) & x != "\\N" & x != "" & x != "NULL"
    }
    thin <- rawData %>%
      dplyr::select(c("taxon", "latitude", "longitude", "type","databaseSource")) %>%
      mutate(hasLat = isUsable(latitude)) %>%
      mutate(hasLong = isUsable(longitude)) %>%
      mutate(hasLatLong = hasLat & hasLong)
    dataThin <<- thin

    colNames <- c("species","totalRecords", "hasLat", "hasLong","totalUseful",
                  "totalGRecords","totalGUseful","totalHRecords","totalHUseful",
                  "numberOfUniqueSources")
    # Records without coordinates are deliberately kept: dropping incomplete
    # rows here would make the total and the useful counts identical.
    tbl <- thin %>%
      dplyr::group_by(type, hasLatLong ) %>%
      dplyr::summarize(total = n())

    # Build the summary locally and publish it once it is complete.
    # Previously the global was created first and then only a local copy was
    # filled in, so the global `countsData` stayed an unnamed all-NA frame.
    counts <- data.frame(matrix(NA, nrow = 1, ncol = length(colNames)))
    colnames(counts) <- colNames
    counts$species <- unique(thin$taxon)
    counts$totalRecords <- nrow(thin)
    counts$totalUseful <- sum((subset(tbl, hasLatLong == TRUE))$total)
    counts$totalGRecords <- sum((subset(tbl, type == "G"))$total)
    counts$totalGUseful <- sum((subset(tbl, type == "G" & hasLatLong == TRUE))$total)
    counts$totalHRecords <- sum((subset(tbl, type == "H"))$total)
    counts$totalHUseful <- sum((subset(tbl, type == "H" & hasLatLong == TRUE))$total)
    counts$hasLat <- sum(thin$hasLat)
    counts$hasLong <- sum(thin$hasLong)
    counts$numberOfUniqueSources <- n_distinct(rawData$databaseSource)
    countsData <<- counts

    write.csv(counts, file = paste0(sp_dir,"/counts.csv"),row.names = FALSE)
}
D:/cwrNA/src/dataPrep/nat_area_shp.r 

###
# generate the native area mask based on intersection between points, ecoregions, and countries
# dan.carver@carverd.com
# 20200414
###


# Generate (or reload) the native area polygon of a species: the ecoregions
# that contain at least one occurrence point, clipped to `naSHP`.
#
# species : taxon name; not read directly here.
#
# Reads the globals `sp_dir`, `cleanPoints`, `ecoReg` and `naSHP`.
# Side effects: assigns the global `nativeArea` and, when the shapefile does
# not exist yet, writes <sp_dir>/modeling/nativeArea/narea.shp.
# When no ecoregion intersects the points, only a message is printed and
# `nativeArea` is left untouched.
nat_area_shp <- function(species) {
  nativeDir <- paste0(sp_dir, "/modeling/nativeArea")
  shpPath <- paste0(nativeDir, "/narea.shp")

  # reuse the product if it has already been generated
  if (file.exists(shpPath)) {
    nativeArea <<- readOGR(shpPath, verbose = FALSE)
  } else {
    # define CRS to be equal between points and ecoRegions
    crs(cleanPoints) <- crs(ecoReg)
    # ids of the ecoregions that have points within them
    ecoVal <- data.frame(over(x = cleanPoints, y = ecoReg)) %>%
      dplyr::select(ECO_ID_U) %>%
      distinct() %>%
      drop_na()
    # all occurrences should be land points, so this is only a safety check
    if (length(ecoVal$ECO_ID_U) == 0) {
      print(paste0("No ecoregions intersected with the occurence data. Species can not be modeled."))
    } else {
      # subset ecoRegions that have points within them
      ecoAreas <- subset(ecoReg, ECO_ID_U %in% ecoVal$ECO_ID_U)
      # clip the selected ecoregions to naSHP
      clipArea <- rgeos::gIntersection(ecoAreas, naSHP)
      nativeArea <<- SpatialPolygonsDataFrame(clipArea,
                                              data.frame(ID = seq_len(length(clipArea))))

      # writeOGR was unreliable with a full file path, so the shapefile is
      # written relative to the target directory. The previous working
      # directory is restored when the function exits, so the change cannot
      # leak into the rest of the run.
      oldWd <- setwd(nativeDir)
      on.exit(setwd(oldWd), add = TRUE)
      writeOGR(obj = nativeArea, dsn = "narea.shp", layer = "narea", driver = "ESRI Shapefile") # this is in geographical projection
    }
  }
}
D:/cwrNA/src/dataPrep/speciesCheckWithUSDAGrin.R 

###
# check all the names in the species list against the information we have from GRIN about native area
# 20200203
# dan.carver@carverd.com
###

#grin <- read.csv("D:/cwrNA/parameters/statePerTaxon/CWRofUSA_nativeareas_2020_1_30.csv")
#tGrin <- unique(grin$name)
#taxonGrin <- as.data.frame(tGrin)
#colnames(taxonGrin) <- "taxon"


#occData <<- data.table::fread("D:/cwrNA/occurrence_data2019_05_29/combinedOccurance2020-01-11.csv",header = TRUE)
#tOcc <- unique(occData$taxon)
#taxonOcc <- as.data.frame(tOcc)
#colnames(taxonOcc) <- "taxon"


## species with no matched data in  data in grin
#notInGrin <- tOcc[!tOcc %in% tGrin]
#### need to manually add these species to the spread sheet
#notInGrin


## species with no occ data
#noOcc <- tGrin[!tGrin %in% tOcc]
#noOcc
# drop cucurbitas and capsicum
#noOcc1 <- noOcc[-c(2,3,4,5,6)]
#noOcc1
D:/cwrNA/src/dataPrep/spPoints.r 

###
# call in the raw data and generate a spatial points dataframe
# dan.carver@carverd.com
# 20200414
###

# Turn the records that have usable coordinates into a spatial points
# dataframe and keep only the points that intersect `naSHP`.
#
# species : taxon name; not read directly here.
#
# Reads the globals `dataThin`, `naSHP` and `sp_dir`.
# Side effects: assigns the global `spPoint`, either to the masked
# SpatialPointsDataFrame or to the string "no data available" when no point
# remains; in the first case <sp_dir>/occurrences/rawDataForNA.csv is
# written.
spPoints <- function(species){
    # select all rows flagged as having both coordinates
    latLong <- dataThin %>%
      dplyr::filter(hasLatLong == TRUE)
    # convert via character so factor codes are never mistaken for values
    #https://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-integer-numeric-without-loss-of-information
    latLong$latitude <- as.numeric(as.character(latLong$latitude))
    latLong$longitude <- as.numeric(as.character(latLong$longitude))
    # Drop rows whose coordinates could not be converted to numbers. The
    # check has to look at the coordinate columns themselves; columns 6:7
    # are the hasLat / hasLong flags, which are never NA.
    latLong <- latLong[complete.cases(latLong$latitude, latLong$longitude), ]

    if(nrow(latLong) > 0){
      coord <- latLong %>% dplyr::select(longitude,latitude)

      pts <- sp::SpatialPointsDataFrame(coord,
                                        data = latLong)
      # published before masking, as before, so the unmasked points are
      # available if the masking step fails
      spPoint <<- pts
      # mask to North America
      crs(pts) <- crs(naSHP)
      inNA <- intersect(pts, naSHP)
      if(is.null(inNA) || nrow(inNA) == 0){
        spPoint <<- "no data available"
      }else{
        spPoint <<- inNA
        write.csv(inNA@data,file = paste0(sp_dir,"/occurrences/rawDataForNA.csv"),row.names = FALSE)
      }
    }else{
      print("there are no coodinate pairs for this species")
      spPoint <<- "no data available"
    }
}
D:/cwrNA/src/dataPrep/subSampleByCountry.r 

###
# subsample features based on country if there are over 2000
# 20200414
# dan.carver@carverd.com
# based on work by
# Maria Victoria Diaz
# CIAT, 2018
###

# Subsample the H occurrence points proportionally by country when there are
# more than `numPoints` of them; G points are always kept in full.
#
# species : taxon name; not read directly here.
#
# Reads the globals `cleanPoints`, `numPoints`, `sp_dir` and `ecoReg`.
# Side effects: writes <sp_dir>/cleanedModelingData.csv and reassigns the
# global `cleanPoints` to the (sub)sampled data.
# The random seed is fixed so the subsample is reproducible.
subSampleByCountry<-function(species){
  set.seed(1234)
  # the occurrence limit only applies to H points, so split the data
  d1 <- cleanPoints@data %>%
    dplyr::filter(type == "H")
  dG <- cleanPoints@data %>%
    dplyr::filter(type == "G")
  # unique countries among the H points (NA country codes are ignored)
  countries <- unique(na.omit(d1$iso3_check))
  count_occ <- nrow(d1)

  # numPoints is a user defined variable in runLineal, it is set to 2000
  if(count_occ >= numPoints+1){
    # number of H points per country and their share of all H points;
    # seq_along keeps the loops empty when there is no country at all
    n <- integer(length(countries))
    for(i in seq_along(countries)){
      n[i] <- nrow(d1[which(d1$iso3_check==countries[i]),])
    }
    p <- n/count_occ

    # this is the subsample method from install_github("DFJL/SamplingUtil")
    nsizeProp <- nstrata(n=numPoints,wh=p,method="proportional")

    # draw the allotted number of rows from each country, without replacement
    muestra <- vector("list", length(countries))
    for(i in seq_along(countries)){
      rowsInCountry <- rownames(d1[which(d1$iso3_check==countries[i]),])
      picked <- sample(rowsInCountry, size=nsizeProp[i], replace=FALSE)
      muestra[[i]] <- d1[picked,]
    }
    muestra <- do.call(rbind, muestra)
  }else{
    muestra <- d1
  }
  cleanData <- rbind(data.frame(muestra), dG)
  write.csv(cleanData, file = paste0(sp_dir,"/cleanedModelingData.csv"),row.names = FALSE)
  # reassign cleanPoints to represent the subsampled data
  # NOTE(review): columns 3 and 2 are taken as the coordinates by position;
  # presumably longitude and latitude - confirm against cleanPoints@data.
  cleanPoints <<- SpatialPointsDataFrame(coords = cleanData[,c(3,2)], data = cleanData)
  raster::crs(cleanPoints) <- raster::crs(ecoReg)
}
D:/cwrNA/src/gapAnalysis/combined/fcs_combine.R 

###
# compile insitu and exsitu summary scripts and assign priority levels 
# 20190919
# carver.dan1@gmail.com
###

# Combine the in-situ and ex-situ final conservation scores of a species and
# assign a priority class to their minimum, maximum and mean.
#
# species : taxon name, stored in the ID column of the result.
# returns : one-row data frame with the two scores, their min / max / mean
#           and the class of each; the same frame is written to
#           <sp_dir>/gap_analysis/combined/fcs_combined.csv.
#
# Reads the global `sp_dir` and the two summary.csv files below it.
fcs_combine <- function(species) {

  # priority class of a single score
  priorityClass <- function(score) {
    if (score < 25) {
      "HP"
    } else if (score >= 25 & score < 50) {
      "MP"
    } else if (score >= 50 & score < 75) {
      "LP"
    } else {
      "SC"
    }
  }

  gapDir <- paste0(sp_dir, "/gap_analysis")

  # read the in-situ and ex-situ summaries
  data_in <- read.csv(paste0(gapDir, "/insitu/summary.csv"), sep=",", header=TRUE)
  data_ex <- read.csv(paste0(gapDir, "/exsitu/summary.csv"), sep=",", header=TRUE)

  # combined scores: min, max and mean of the two FCS values
  bothScores <- c(data_ex$FCS, data_in$FCS)
  data_comb <- data.frame(ID=species, FCSex=data_ex$FCS, FCSin=data_in$FCS)
  data_comb$FCSc_min <- min(bothScores, na.rm=TRUE)
  data_comb$FCSc_max <- max(bothScores, na.rm=TRUE)
  data_comb$FCSc_mean <- mean(bothScores, na.rm=TRUE)

  # classes, in the same column order as the scores
  data_comb$FCSc_min_class <- priorityClass(data_comb$FCSc_min)
  data_comb$FCSc_max_class <- priorityClass(data_comb$FCSc_max)
  data_comb$FCSc_mean_class <- priorityClass(data_comb$FCSc_mean)

  # make sure the output directory is there
  comb_dir <- paste0(gapDir, "/combined")
  if (!file.exists(comb_dir)) {
    dir.create(comb_dir)
  }

  # save and hand back the combined table
  write.csv(data_comb, paste0(comb_dir, "/fcs_combined.csv"), row.names=FALSE)
  return(data_comb)
}
D:/cwrNA/src/gapAnalysis/exsitu/ers_exsitu.r 

###
# calculates the total ecoregions within modeled area where G occurrences have
# been collected
# dan.carver@carverd.com
# 20200414
###

# Ex-situ ecological representativeness score (ERS): share of the
# ecoregions holding occurrence points that are also covered by the
# buffered G points inside the predicted area, capped at 100.
#
# species : taxon name, stored in the ID column of the result.
#
# Reads the globals `sp_dir`, `cleanPoints` and `ecoReg`.
# Side effects: writes <sp_dir>/gap_analysis/exsitu/ers_result.csv; when the
# G buffer raster exists it also assigns the global `pa_spp`, and when the
# buffer overlaps the predicted area the global `ecoValsAllPointsLen`.
ers_exsitu <- function(species) {
    bufferFile <- paste0(sp_dir,"/modeling/alternatives/ga50.tif")

    # write the one-row result table
    saveResult <- function(nAll, nG, score) {
      out_df <- data.frame(ID=species, SPP_N_ECO=nAll, G_N_ECO=nG, ERS=score)
      write.csv(out_df, paste0(sp_dir,"/gap_analysis/exsitu/ers_result.csv"), row.names=FALSE)
    }

    # the counts file is loaded (and must exist) but its content is not used
    sp_counts <- read.csv(paste0(sp_dir,"/counts.csv"))

    # distinct ecoregions that contain at least one occurrence point
    crs(cleanPoints) <- crs(ecoReg)
    ecoVal <- data.frame(over(x = cleanPoints , y = ecoReg)) %>%
      dplyr::select(ECO_ID_U) %>%
      distinct() %>%
      drop_na()

    if (file.exists(bufferFile)) {
      # G buffer and thresholded model
      gBuffer <- raster::raster(bufferFile)
      thrsld <- raster(paste0(sp_dir,"/modeling/spdist_thrsld_median.tif"))
      pa_spp <<- thrsld
      # zeros become NA so only predicted-present cells survive the product
      thrsld[thrsld == 0] <- NA
      gBuffer <- gBuffer * thrsld

      if (1 %in% unique(values(gBuffer))) {
        # ecoregions under the buffered cells that lie in the predicted area
        gPoints <- sp::SpatialPoints(raster::rasterToPoints(gBuffer))
        crs(gPoints) <- crs(ecoReg)
        ecoValsG <- sp::over(x = gPoints, y = ecoReg) %>%
          distinct(ECO_ID_U)
        nEcoG <- length(ecoValsG[!is.na(ecoValsG$ECO_ID_U),])

        # number of ecoregions present in all points
        nEcoAll <- nrow(ecoVal)
        ecoValsAllPointsLen <<- nEcoAll

        saveResult(nEcoAll, nEcoG, min(c(100, (nEcoG/nEcoAll)*100)))
      } else {
        # no buffered area inside the predicted distribution
        saveResult(0, 0, 0)
      }
    } else {
      # species without G points: no buffer raster has been created
      saveResult(0, 0, 0)
    }
}
D:/cwrNA/src/gapAnalysis/exsitu/fcs_exsitu.r 

###
# generates the final exsitu score: (srs + grs + ers) / 3
# dan.carver@carverd.com
# 20200414
###

# Final ex-situ conservation score (FCS): the mean of SRS, GRS and ERS when
# both the GRS and ERS results exist, otherwise the SRS alone.
#
# species : taxon name, stored in the ID column of the result.
#
# Reads the global `sp_dir` and the srs / grs / ers result files below
# <sp_dir>/gap_analysis/exsitu.
# Side effect: writes summary.csv in that same directory.
fcs_exsitu <- function(species) {
  exDir <- paste0(sp_dir, "/gap_analysis/exsitu")
  srsFile <- paste0(exDir, "/srs_result.csv")
  grsFile <- paste0(exDir, "/grs_result.csv")
  ersFile <- paste0(exDir, "/ers_result.csv")

  # load whichever result files are present
  hasGrs <- file.exists(grsFile)
  hasErs <- file.exists(ersFile)
  if (file.exists(srsFile)) {
    sp_srs <- read.csv(srsFile)
  }
  if (hasGrs) {
    sp_grs <- read.csv(grsFile)
  }
  if (hasErs) {
    sp_ers <- read.csv(ersFile)
  }

  if (hasGrs & hasErs) {
    # a model was run: average the three scores
    grsVal <- sp_grs$GRS
    ersVal <- sp_ers$ERS
    sp_fcs <- mean(c(sp_srs$SRS, grsVal, ersVal), na.rm=TRUE)
  } else {
    # only the SRS has been run
    grsVal <- NA
    ersVal <- NA
    sp_fcs <- sp_srs$SRS
  }

  # priority class of the final score
  score <- if (sp_fcs < 25) {
    "HP"
  } else if (sp_fcs < 50) {
    "MP"
  } else if (sp_fcs < 75) {
    "LP"
  } else {
    "SC"
  }

  # create data.frame with output and save it
  out_df <- data.frame(ID=species, SRS=sp_srs$SRS, GRS=grsVal,
                       ERS=ersVal, FCS=sp_fcs, FCS_Score = score)
  write.csv(out_df, paste0(exDir, "/summary.csv"), row.names=FALSE)
}
D:/cwrNA/src/gapAnalysis/exsitu/grs_exsitu.r 

###
# determines the total buffered area of g points within modeled area
# dan.carver@carverd.com
# 20200414
###

# Ex-situ geographic representativeness score (GRS): buffered G area inside
# the predicted area divided by the predicted area, as a percentage capped
# at 100.
#
# species : taxon name, stored in the ID column of the result.
#
# Reads the global `sp_dir`.
# Side effects: writes <sp_dir>/gap_analysis/exsitu/grs_result.csv; when the
# G buffer raster exists it also assigns the globals `pa_spp`,
# `pa_spp_area` and `gBufferRas1`, and when the buffer overlaps the
# predicted area the global `gBufferRas_area`.
grs_exsitu <- function(species) {
  bufferFile <- paste0(sp_dir,"/modeling/alternatives/ga50.tif")

  # area of the non-NA cells: number of cells times the median cell area
  rasterArea <- function(ras) {
    cell_size <- area(ras, na.rm=TRUE, weights=FALSE)
    cell_size <- cell_size[!is.na(cell_size)]
    length(cell_size)*median(cell_size)
  }

  # defaults, used when there are no G points or no overlap
  grs <- 0
  gArea <- 0
  sppArea <- NA

  if (file.exists(bufferFile)) {
    # thresholded model; zeros become NA so they do not count as area
    thrsld <- raster(paste0(sp_dir,"/modeling/spdist_thrsld_median.tif"))
    pa_spp <<- thrsld
    thrsld[thrsld == 0] <- NA
    predictedArea <- rasterArea(thrsld)
    pa_spp_area <<- predictedArea

    # G buffer restricted to the predicted area
    gBufferRas <- raster::raster(bufferFile)
    inPredicted <- gBufferRas * thrsld
    gBufferRas1 <<- inPredicted

    # more than one distinct cell value means some buffer cell overlaps
    if (length(unique(values(inPredicted))) != 1) {
      gArea <- rasterArea(inPredicted)
      gBufferRas_area <<- gArea
      sppArea <- predictedArea

      #calculate GRS
      grs <- min(c(100, gArea/sppArea*100))
    }
  }

  #create data.frame with output
  out_df <- data.frame(ID=species, SPP_AREA_km2=sppArea, G_AREA_km2=gArea, GRS=grs)
  write.csv(out_df, paste0(sp_dir,"/gap_analysis/exsitu/grs_result.csv"), row.names=FALSE)
}
D:/cwrNA/src/gapAnalysis/exsitu/srs_exsitu.R 

###
# generates the srs exsitu values based on the counts data
# dan.carver@carverd.com
# 20200414
###

# Ex-situ sampling representativeness score (SRS): G records as a percentage
# of H records, capped at 100.
#
# species : taxon name, stored in the ID column of the result.
#
# Reads the global `sp_dir` and <sp_dir>/counts.csv.
# Side effects: assigns the global `sp_counts` (the counts table) and writes
# <sp_dir>/gap_analysis/exsitu/srs_result.csv.
srs_exsitu <- function(species) {
  # read in data
  counts <- read.csv(paste0(sp_dir,"/counts.csv"))
  sp_counts <<- counts

  nG <- counts$totalGRecords
  nH <- counts$totalHRecords

  if (nG >= 1 & nH == 0) {
    # G records but no H records
    srs <- 100
  } else if (nG == 0 & nH == 0) {
    # no records at all
    srs <- 0
  } else {
    # species with data
    srs <- min(c(100, nG/nH*100))
  }

  #create data.frame with output
  out_df <- data.frame(ID=species,
                       NTOTAL=counts$totalRecords,
                       NTOTAL_COORDS=counts$totalUseful,
                       NG=nG,
                       NG_COORDS=counts$totalGUseful,
                       NH=nH,
                       NH_COORDS=counts$totalHUseful,
                       SRS=srs)
  write.csv(out_df, paste0(sp_dir,"/gap_analysis/exsitu/srs_result.csv"), row.names=FALSE)
}
D:/cwrNA/src/gapAnalysis/insitu/ers_insitu.r 

###
# calculate the ers insitu = ecoregions in protected areas / ecoregions in SDM *100
# 20200414
# dan.carver@carverd.com
###

# In-situ ecological representativeness score (ERS):
# ecoregions in protected areas / ecoregions in the predicted area * 100,
# capped at 100.
#
# species : taxon name, stored in the ID column of the result.
#
# Reads the globals `sp_dir`, `proArea` and `ecoReg`.
# Side effects: assigns the global `pa_spp` (the threshold raster as read
# from disk) and writes <sp_dir>/gap_analysis/insitu/ers_result.csv.
ers_insitu <- function(species) {
    # number of distinct ecoregions under the non-NA cells of a raster
    countEcoRegions <- function(ras) {
      pts <- sp::SpatialPoints(raster::rasterToPoints(ras))
      crs(pts) <- crs(ecoReg)
      ecoIds <- data.frame(sp::over(x = pts, y = ecoReg)) %>%
        dplyr::select(ECO_ID_U) %>%
        distinct() %>%
        drop_na()
      nrow(ecoIds)
    }

    # import the threshold raster; it is used to mask the protected areas so
    # only the protected cells in the predicted area remain
    thrsld <- raster(paste0(sp_dir,"/modeling/spdist_thrsld_median.tif"))
    pa_spp <<- thrsld
    # zeros become NA so only predicted-present cells are counted
    thrsld[thrsld == 0] <- NA

    # No predicted presence at all. The test must reduce to one TRUE/FALSE
    # and must look at this function's own raster.
    if (all(is.na(values(thrsld)))) {
      ecoInSDM <- 0
      ecoInProt <- 0
      ers <- 0
    } else {
      crs(thrsld) <- crs(proArea)
      proNative <- raster::crop(x = proArea, y = thrsld)
      proNative <- proNative * thrsld

      # number of ecoregions in the SDM
      ecoInSDM <- countEcoRegions(thrsld)

      if (length(unique(values(proNative))) == 1) {
        # no protected cell inside the predicted area
        ecoInProt <- 0
        ers <- 0
      } else {
        # number of ecoregions in protected areas
        ecoInProt <- countEcoRegions(proNative)
        # calculate ERS; avoid 0/0 when no ecoregion was matched at all
        if (ecoInSDM > 0) {
          ers <- min(c(100, (ecoInProt/ecoInSDM)*100))
        } else {
          ers <- 0
        }
      }
    }
    #create data.frame with output
    df <- data.frame(ID=species, SPP_N_ECO = ecoInSDM, SPP_WITHIN_PA_N_ECO = ecoInProt, ERS = ers)
    write.csv(df,paste0(sp_dir,"/gap_analysis/insitu/ers_result.csv"),row.names=FALSE)
}
D:/cwrNA/src/gapAnalysis/insitu/fcs_insitu.r 

###
# calculate the fcsIn = (SRSin + GRSin + ERSin) / 3
# dan.carver@carverd.com
# 20200414
###

# Final in-situ conservation score (FCS): the mean of SRS, GRS and ERS when
# both the GRS and ERS results exist, otherwise the SRS alone (the in-situ
# SRS can be produced without a modeled area).
#
# species : taxon name, stored in the ID column of the result.
# returns : the one-row summary data frame, which is also written to
#           <sp_dir>/gap_analysis/insitu/summary.csv.
#
# Reads the global `sp_dir` and the srs / grs / ers result files below
# <sp_dir>/gap_analysis/insitu.
fcs_insitu <- function(species) {
  inDir <- paste0(sp_dir, "/gap_analysis/insitu")
  srsFile <- paste0(inDir, "/srs_result.csv")
  grsFile <- paste0(inDir, "/grs_result.csv")
  ersFile <- paste0(inDir, "/ers_result.csv")

  # load whichever result files are present
  hasGrs <- file.exists(grsFile)
  hasErs <- file.exists(ersFile)
  if (file.exists(srsFile)) {
    sp_srs <- read.csv(srsFile)
  }
  if (hasGrs) {
    sp_grs <- read.csv(grsFile)
  }
  if (hasErs) {
    sp_ers <- read.csv(ersFile)
  }

  # `$SRS` is kept on purpose: `$` matches partially, so it also picks up
  # the score when the SRS file names that column SRS.SRS
  if (hasGrs & hasErs) {
    # successful model: average the three scores
    grsVal <- sp_grs$GRS
    ersVal <- sp_ers$ERS
    sp_fcs <- mean(c(sp_srs$SRS, grsVal, ersVal), na.rm=TRUE)
  } else {
    # only the SRS has been run
    grsVal <- NA
    ersVal <- NA
    sp_fcs <- sp_srs$SRS
  }

  # priority class of the final score
  score <- if (sp_fcs < 25) {
    "HP"
  } else if (sp_fcs < 50) {
    "MP"
  } else if (sp_fcs < 75) {
    "LP"
  } else {
    "SC"
  }

  # create data.frame with output and save it
  outDf <- data.frame(ID=species, SRS.NTOTAL = sp_srs$NTOTAL, SRS.ProTotal = sp_srs$ProTotal,
                      SRS.SRS=sp_srs$SRS, GRS=grsVal,
                      ERS=ersVal, FCS=sp_fcs, FCS_Score = score)
  write.csv(outDf, paste0(inDir, "/summary.csv"), row.names=FALSE)

  #return object
  return(outDf)
}
D:/cwrNA/src/gapAnalysis/insitu/insitu_grs.r 

###
# Calculate the GRSin = area in protect areas / total area * 100
# 20200414
# dan.carver@carverd.com
###

# In-situ geographic representativeness score (GRS):
# area in protected areas / total predicted area * 100, capped at 100.
#
# species : taxon name, stored in the ID column of the result.
#
# Reads the globals `sp_dir` and `proArea`.
# Side effects: when there is predicted habitat, writes the protected part
# of the predicted area to
# <sp_dir>/modeling/alternatives/grs_pa_PAs_narea_areakm2.tif; always writes
# <sp_dir>/gap_analysis/insitu/grs_result.csv.
insitu_grs <- function(species) {
      # area of the non-NA cells: number of cells times the median cell area
      rasterArea <- function(ras) {
        cell_size <- area(ras, na.rm=TRUE, weights=FALSE)
        cell_size <- cell_size[!is.na(cell_size)]
        length(cell_size)*median(cell_size)
      }

      # read in threshold raster
      thrshold <- raster(x = paste0(sp_dir, "/modeling/spdist_thrsld_median.tif"))
      # replace 0 with NA so only predicted-present cells count as area
      thrshold[which(thrshold[] == 0)] <- NA

      # No predicted suitable habitat. The test must reduce to one
      # TRUE/FALSE: a vector condition in `if` is an error in R >= 4.2 and
      # only looked at the first cell value before that.
      if (all(is.na(values(thrshold)))) {
        protect_area <- 0
        grs <- 0
        thrshold_area <- 0
      } else {
        # crop the protected areas to the extent of the threshold raster
        proNative <- raster::crop(x = proArea, y = thrshold)
        # keep only protected cells inside the predicted area
        protectSDM <- thrshold * proNative
        writeRaster(x = protectSDM,filename = paste0(sp_dir,"/modeling/alternatives/grs_pa_PAs_narea_areakm2.tif" ), overwrite=TRUE)

        # total predicted area of the species
        thrshold_area <- rasterArea(thrshold)

        if (length(unique(values(protectSDM))) == 1) {
          # no protected area in threshold area
          protect_area <- 0
          grs <- 0
        } else {
          # area of the cells within protected areas within the threshold area
          protectSDM[which(protectSDM[] == 0)] <- NA
          protect_area <- rasterArea(protectSDM)

          #calculate GRS
          grs <- min(c(100, protect_area/thrshold_area*100))
        }
      }
    #create data.frame with output
    df <- data.frame(ID = species, SPP_AREA_km2 = thrshold_area, SPP_WITHIN_PA_AREA_km2 = protect_area, GRS = grs)
    write.csv(df,paste0(sp_dir,"/gap_analysis/insitu/grs_result.csv"),row.names=FALSE)
}
D:/cwrNA/src/gapAnalysis/insitu/srs_insitu.r 

###
# Calculate the proportion of points that fall within a protected areas found within the predicted
# model extent. Insitu SRS
# 20200303
# dan.carver@carverd.com
###

# In-situ sampling representativeness score (SRS): percentage of the
# occurrence points that fall on a protected cell inside the predicted area.
#
# species : taxon name, stored in the ID column of the result.
# returns : one-row data frame (ID, NTOTAL, ProTotal, SRS), which is also
#           written to <sp_dir>/gap_analysis/insitu/srs_result.csv.
#
# Reads the globals `sp_dir`, `cleanPoints` and `proArea`.
srs_insitu <- function(species) {

  totalNum <- nrow(cleanPoints)

  # read in threshold raster; zeros become NA so they mask the product below
  thrshold <- raster(x = paste0(sp_dir, "/modeling/spdist_thrsld_median.tif"))
  thrshold[which(thrshold[] == 0)] <- NA
  # crop the protected areas to the extent of the threshold raster
  proNative <- raster::crop(x = proArea, y = thrshold)
  # keep only protected cells inside the predicted area
  protectSDM <- thrshold * proNative

  # set coordinate systems equal (this modifies a local copy of the points)
  crs(cleanPoints) <- crs(protectSDM)
  # extract the raster value under every point; a non-NA value means the
  # point lies on a protected cell of the predicted area
  protectPoints <- sum(!is.na(raster::extract(x = protectSDM,y = cleanPoints)))

  # define SRS; without any point the ratio would be 0/0, so the score is 0
  if(totalNum > 0){
    srsInsitu <- 100 *(protectPoints/totalNum)
  }else{
    srsInsitu <- 0
  }

  #create data.frame with output
  out_df <- data.frame(ID=species,
                       NTOTAL=totalNum,
                       ProTotal = protectPoints,
                       SRS=srsInsitu)
  write.csv(out_df,paste0(sp_dir,"/gap_analysis/insitu/srs_result.csv"),row.names=FALSE)

  #return object
  return(out_df)
}
D:/cwrNA/src/gapAnalysis/insitu/srs_insitu_preModel.r 

###
# Generate a SRSin based on all points before the species is modeled, so that a metric can be provided that
# will then be replaced in the SRSin method if a species can be successfully modeled.
# 20200303
# dan.carver@carverd.com
###

# In-situ sampling representativeness score computed before any model is
# run: percentage of all points that fall on a protected-area cell.
#
# species : taxon name, stored in the ID column of the result.
#
# Reads the globals `sp_dir`, `spPoint` and `proArea`. `spPoint` is either a
# spatial points object or a character string standing for "no data".
# Side effect: writes <sp_dir>/gap_analysis/insitu/srs_result.csv with the
# columns ID, NTOTAL, ProTotal and SRS.SRS.
srs_insitu_preModel <- function(species) {
  resultFile <- paste0(sp_dir,"/gap_analysis/insitu/srs_result.csv")

  # clause for no points in north america: spPoint holds a message string
  if(is.character(spPoint)){
    #create data.frame with output
    out_df <- data.frame(ID=species,
                         NTOTAL=0,
                         ProTotal = 0,
                         SRS.SRS=0)
    write.csv(out_df, resultFile, row.names=FALSE)
  }else{
    totalNum <- nrow(spPoint)
    # set coordinate systems equal (this modifies a local copy of the points)
    crs(spPoint) <- crs(proArea)
    # extract the raster value under every point; a non-NA value means the
    # point lies on a protected-area cell
    protectPoints <- sum(!is.na(raster::extract(x = proArea,y = spPoint)))
    # define SRS; without any point the ratio would be 0/0, so the score is 0
    if(totalNum > 0){
      srsInsitu <- 100 *(protectPoints/totalNum)
    }else{
      srsInsitu <- 0
    }

    #create data.frame with output
    out_df <- data.frame(ID=species,
                         NTOTAL=totalNum,
                         ProTotal = protectPoints,
                         SRS.SRS=srsInsitu)
    write.csv(out_df, resultFile, row.names=FALSE)
  }
}
D:/cwrNA/src/gapAnalysis/redList/eooAoo.R 

###
# use the redlistr library to calculate the EOO and AOO assessments for the species
# dan.carver@carverd.com
# 20200414
###

eooAoo <- function(species){
  # Derive Red List style categories from the extent of occurrence (EOO) and
  # the area of occupancy (AOO) of a species and write a one-row summary to
  # <sp_dir>/gap_analysis/redList/listingValues4kmClean.csv.
  #
  # species : taxon name; only used to label the output row.
  #
  # Reads the globals spPoint, cleanPoints and sp_dir, and calls makeEOO,
  # getAreaEOO, makeAOOGrid and gridUncertainty (the file header attributes
  # these to the redListR library).
  outFile <- paste0(sp_dir, '/gap_analysis/redList/listingValues4kmClean.csv')
  listingCols <- c("taxon", "EOO Area km2","EOO Status", "AOO",
                   "AOO adjusted Minimum", "AOO Status", "Combined Status")
  critical <- "Critically Endangered (CR)"
  # category names ordered from least to most threatened; position 6 repeats
  # CR and is used when an area could not be computed (NA)
  status <- c("Least Concern (LC)","Possible Near Threatened (NT)",
              "Vulnerable (VU)", "Endangered (EN)", critical, critical)

  # rank an area against descending thresholds: 1 when the area is at or above
  # the largest threshold, plus one for every threshold it falls below, and 6
  # when the area is NA. The NA test comes first because comparing NA inside
  # `if` is an error.
  rankArea <- function(area, thresholds){
    if(is.na(area)){
      return(6)
    }
    1 + sum(area < thresholds)
  }

  # clause for low occurrence species; `||` short-circuits so cleanPoints is
  # not touched when spPoint is only a character placeholder
  if(is.character(spPoint) || nrow(cleanPoints@data) <= 2){
    df <- data.frame(matrix(data = NA, nrow = 1, ncol = 7))
    colnames(df) <- listingCols
    df$taxon <- species
    # the area columns stay NA; all status columns default to CR
    df$`EOO Status` <- critical
    df$`AOO Status` <- critical
    df$`Combined Status` <- critical
    write.csv(x = df, file = outFile)
  }else{
    if(!inherits(cleanPoints, "SpatialPointsDataFrame")){
      cleanPoints <- spPoint
    }

    # NOTE(review): the skip test looks for listingValues.csv while this
    # function writes listingValues4kmClean.csv; confirm which file should
    # gate a re-run.
    if(file.exists(paste0(sp_dir, "/gap_analysis/redList/listingValues.csv"))){
      print('completed, moving on')
    }else{
      # project the points from WGS84 lat/long to a cylindrical equal area CRS
      wgs84 <- crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
      worldEqualArea <- crs("+proj=cea +lon_0=0 +lat_ts=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs ")
      cleanPoints2 <- SpatialPoints(coords = cleanPoints@coords, proj4string = wgs84)
      spAllPro <- spTransform(cleanPoints2, worldEqualArea)

      ### Subcriterion B1 (EOO)
      EOO.polygon <- makeEOO(spAllPro)
      EOO.area <- getAreaEOO(EOO.polygon)
      # thresholds: < 45000 NT, < 20000 VU, < 5000 EN, < 100 CR
      eVal <- rankArea(EOO.area, c(45000, 20000, 5000, 100))

      ### Subcriterion B2 (AOO)
      # overlay a grid with 4000 unit cells; each occupied cell counts as 4
      AOO.grid <- makeAOOGrid(spAllPro, grid.size = 4000,
                              min.percent.rule = FALSE)
      n.AOO <- length(AOO.grid)
      AOOarea <- n.AOO * 4
      # thresholds: < 4500 NT, < 2000 VU, < 500 EN, < 10 CR
      aVal <- rankArea(AOOarea, c(4500, 2000, 500, 10))

      # because the position of the grid cells can change the number of
      # occupied cells, a randomized process is used to find a minimum count.
      # 20200414, the n.AOO.improvement is set to one as a time saving measure.
      # we do not use the grid uncertainty measure in the categorization of
      # the species, so timeliness was most important here.
      gU.results <- gridUncertainty(spAllPro, 4000,
                                    n.AOO.improvement = 1,
                                    min.percent.rule = FALSE)

      # dataframe to save items
      df <- data.frame(matrix(data = NA, nrow = 1, ncol = 7))
      colnames(df) <- listingCols
      df$taxon <- species
      df$`EOO Area km2` <- EOO.area
      df$`EOO Status` <- status[eVal]
      df$AOO <- AOOarea
      df$`AOO adjusted Minimum` <- gU.results$min.AOO.grid$AOO.number * 4
      df$`AOO Status` <- status[aVal]
      # the more threatened (higher ranked) of the two defines the overall status
      df$`Combined Status` <- status[max(eVal, aVal)]
      write.csv(x = df, file = outFile)
    }
  }
}
D:/cwrNA/src/gatherAllCode.r 

### 
# code for compiling all code into a single doc. I want to do this to search for where elements are being defined
# 20191212
# carver.dan1@gmail.com
### 

# Concatenate every R source file below outLocation into textOut.txt so the
# whole code base can be searched in one document.
outLocation <- "D:/cwrNA/src"
setwd(outLocation)
files <- sort(list.files(path = outLocation, pattern = "\\.r$",full.names = TRUE,recursive = TRUE,ignore.case = TRUE))

# divert console output into the text file (relative to outLocation)
sink(file = "textOut.txt")

# `finally` restores the console even when a file cannot be read, and
# seq_along() makes the loop a no-op when no files were found
tryCatch({
  for(i in seq_along(files)){
    file <- readLines(files[i])
    # file path as a header, then the file content line by line
    cat(files[i], "\n\n")
    cat(file, sep = "\n")
  }
}, finally = sink())
D:/cwrNA/src/master.R 

###
# Primary function to call all functions that are a part of the modeling method. 
# dan.carver@carver.com 
# 20200414
### 

master_run <- function(species){
  # Run the complete modeling and gap analysis workflow for one species.
  #
  # species : taxon name. It is also assigned to the global environment and
  #           passed to every step function.
  #
  # The step functions communicate through global variables (sp_dir, spPoint,
  # cleanPoints, nativeArea, thrshold, ...), so the order of the calls below
  # matters. Each step is timed; the timing table is written to
  # <sp_dir>/time_df.csv only when the workflow reaches the final report.
  # The workflow stops early, with a printed message, when a prerequisite
  # for the next stage is missing.
  species <<- species
  print(paste0("the process for ", species, " has begun."))

  # build a dataframe that captures the run time of every step
  time_df <- data.frame(matrix(ncol = 2, nrow = 0))
  colnames(time_df) <- c("functionUsed", "runTime")

  # start time of the whole run (kept from the original; not read again here)
  startTime <- Sys.time()

  # Time one step. `expr` is evaluated lazily, so the clock starts before the
  # step runs; `announce` is an optional progress message. Returns one row
  # for time_df.
  timedStep <- function(label, expr, announce = NULL){
    began <- Sys.time()
    if(!is.null(announce)){
      cat(announce)
    }
    force(expr)
    data.frame(functionUsed = label,
               runTime = difftime(Sys.time(), began, units = 'secs'))
  }

  time_df <- rbind(time_df, timedStep("create_sp_dirs", create_sp_dirs(species),
                                      "...creating directories\n"))
  # sp_dir <<- primary directory for this run

  if(file.exists(paste0(sp_dir, '/sdm.rds'))){
    print('been modeled, moving on')
  }else{
    time_df <- rbind(time_df, timedStep("developing raw data", developRaw(species),
                                        "...developing raw data\n"))
    # rawData <<- listing of all data for the species

    # label corrected: this row times developCounts (it was "testLatLong")
    time_df <- rbind(time_df, timedStep("developCounts", developCounts(species),
                                        "...generating counts csv\n"))
    # dataThin <<- raw data with complete lat long and only necessary

    time_df <- rbind(time_df, timedStep("srs_exsitu", srs_exsitu(species),
                                        "...conducting SRSex assessment\n"))

    time_df <- rbind(time_df, timedStep("spPoints", spPoints(species),
                                        "...creating spatial points data frame\n"))
    # spPoint <<- all usable occurrence data for NA

    time_df <- rbind(time_df, timedStep("addNorthAmericanCounts", addNorthAmericanCounts(species),
                                        "...adding North American Points to Counts.csv\n"))

    time_df <- rbind(time_df, timedStep("srs_insitu_preModel", srs_insitu_preModel(species),
                                        "...srsIn before the modeling process\n"))

    if(is.character(spPoint)){
      print("there is not locational data for this species this is the end of the modeling process")
    }else{
      # this step now gets its own timer; it used to reuse the start time of
      # srs_insitu_preModel
      time_df <- rbind(time_df, timedStep("countryCheck", countryCheck(species),
                                          "...extracting country values to points and remove duplicate lat long\n"))
      # xyData <<- coords for elements in spPoint object
      # cleanPoints <<- spatial points layer where all points are on land.

      if(is.character(cleanPoints)){
        print('no usable records for the modeling method')
      }else{
        time_df <- rbind(time_df, timedStep("sampling", subSampleByCountry(species),
                                            "...spatial sample to under 2000 tota\n"))
        # cleanPoints <<- if spatial thinning is applied the cleanPoint var is redeclared.
        # code needs to be run to this location in order to produce htmls

        time_df <- rbind(time_df, timedStep("nat_area_shp", nat_area_shp(species),
                                            "...generate native area shp\n"))
        # nativeArea <<- ecoregions clipped to countries with species present

        time_df <- rbind(time_df, timedStep("eooAoo", eooAoo(species),
                                            "...conducting EOO and AOO assessment\n"))

        if(is.character(nativeArea)){
          print("not enough points for modelings")
        }else{
          if(nrow(cleanPoints)<= 3){
            print("not enough points for modelings")
          }else{
            time_df <- rbind(time_df, timedStep("create_buffers", create_buffers(species),
                                                "...generate ga50Raster\n"))

            time_df <- rbind(time_df, timedStep("generateModelingData", generateModelingData(species),
                                                "...generate background and extract raster data to background and presence data\n"))
            # bioValues <<- Presence and background points with predictor data attached

            time_df <- rbind(time_df, timedStep("varaibleSelection", varaibleSelection(species),
                                                "...perform variable selection and correlation\n"))
            # bioValues <<- redefined if any NA are present in predictor datasets.
            # variblesToModel <<- a listing of variable names used for modeling

            time_df <- rbind(time_df, timedStep("runMaxnet", runMaxnet(species),
                                                "...perform maxent model\n"))
            # sdm_results <<- output of the Maxnet modeling process

            if(!file.exists(paste0(sp_dir, "/sdm.rds"))){
              print("the model did not successfully run")
            }else{
              time_df <- rbind(time_df, timedStep("evaluate_sdm_function", evaluate_sdm_function(species)))
              # thrshold <<- threshold raster

              if(length(unique(values(thrshold)))==2){
                print("the model did not successfully run")
              }else{
                time_df <- rbind(time_df, timedStep("messMap", messMap(species),
                                                    "...create a mess map based on top predictor \n"))

                time_df <- rbind(time_df, timedStep("kernalDensity", kernalDensity(species),
                                                    "...create a kernal density map of the sample points  \n"))

                ### start of the gap analysis metrics.
                time_df <- rbind(time_df, timedStep("grs_exsitu", grs_exsitu(species),
                                                    "...conducting GRSex assessment\n"))

                time_df <- rbind(time_df, timedStep("ers_exsitu", ers_exsitu(species),
                                                    "...conducting ERSex assessment\n"))

                time_df <- rbind(time_df, timedStep("fcs_exsitu", fcs_exsitu(species),
                                                    "...conducting  fcsex assessment\n"))

                cat("...Ex situ assessment is complete\n")

                time_df <- rbind(time_df, timedStep("srs_insitu", srs_insitu(species),
                                                    "...conducting srs In assessment\n"))

                time_df <- rbind(time_df, timedStep("insitu_grs", insitu_grs(species),
                                                    "...conducting grs In assessment\n"))

                time_df <- rbind(time_df, timedStep("ers_insitu", ers_insitu(species),
                                                    "...conducting ers In assessment\n"))

                time_df <- rbind(time_df, timedStep("fcs_insitu", fcs_insitu(species),
                                                    "...conducting fcs In assessment\n"))

                cat("...in situ assessment is complete\n")

                time_df <- rbind(time_df, timedStep("fcs_combine", fcs_combine(species),
                                                    "...conducting fcs combined assessment\n"))

                # the report is rendered directly (not through timedStep) so
                # the frame render() is called from stays the same as before
                t1a <- Sys.time()
                rmarkdown::render(paste0(repo_dir, "/summaryMarkdown/singleSpeciesSummary.rmd"),
                                  output_file =  paste("report_", species, '_', run_version,'_' , Sys.Date(), ".html", sep=''),
                                  output_dir = paste0(gap_dir,"/", genus,"/summaryDocs"))
                time_df <- rbind(time_df, data.frame(functionUsed="sinlge species Summary",  runTime=difftime(Sys.time(), t1a, units='secs')))

                time_df$Minutes <- time_df$runTime/60
                write.csv(x = time_df, file = paste0(sp_dir, "/time_df.csv"))
                print(paste0("the process for ", species, " has ended."))

                # remove all global variables that are specific to the species.
                # The objects are addressed by name; the previous code collected
                # their values into a local vector and removed only that vector,
                # so nothing was cleared.
                speciesGlobals <- unique(c("xyData", "cleanPoints", "sp_dir", "rawData", "dataThin",
                                           "nativeArea", "spPoint", "pa_spp", "ecoValsAllPointsLen",
                                           "pa_spp_area", "gBufferRas1", "gBufferRas_area", "sp_counts",
                                           "bioValues", "sdm", "evaluate_table", "sdm_results", "thrshold",
                                           "evaluate_table_f", "crossValDir", "rastersToModel",
                                           "data_train", "variblesToModel", "species"))
                # only remove what exists so rm() does not warn about names
                # that were never created in this run
                present <- speciesGlobals[vapply(speciesGlobals, exists, logical(1),
                                                 envir = globalenv(), inherits = FALSE)]
                rm(list = present, envir = globalenv())
                beepr::beep(1)
              }
            }
          }
        }
      }
    }
  }
}


D:/cwrNA/src/modeling/alternative/create_buffers.r 

###
# buffer all g points and clip to native area
# dan.carver@carverd.com
# 20200414
###

create_buffers <- function(species){
    # Buffer every "G" type point of the species, rasterize the buffers on the
    # predictor grid, clip the result to the native area and write it to
    # <sp_dir>/modeling/alternatives/ga50.tif.
    #
    # species : taxon name; accepted for a uniform interface, not used here.
    #
    # Reads the globals cleanPoints, nativeArea, bufferDist, bioVars and sp_dir.
    # When the species has no G points a message is printed and nothing is
    # written.

    ## select all g points from point object
    p1 <- subset(cleanPoints, type == "G")
    # ensure matching CRS
    raster::crs(p1) <- raster::crs(nativeArea)

    # clause to test for G occurrences
    if(nrow(p1@data)== 0){
      print("there are no g points for this species")
    }else{
      ## buffering
      buffer <- geobuffer_pts(xy = p1,
                              dist_m = bufferDist,
                              output = 'sf')
      # predictor stack cropped and masked to the native area; its first layer
      # is the template grid for the rasterization
      rasters1 <- bioVars$as.RasterStack() %>%
        raster::crop(nativeArea) %>%
        raster::mask(nativeArea)

      ## rasterizing and matching cells to predictor layers
      buffer_rs <- fasterize::fasterize(buffer, rasters1$layer.1)

      # mask buffer to native area
      maskBuff <- raster::crop(x = buffer_rs, y = nativeArea) %>%
        raster::mask(nativeArea)

      ## writing raster; the file name previously carried trailing whitespace
      ## after ".tif", so the written file did not have a clean .tif name
      writeRaster(maskBuff, paste0(sp_dir,"/modeling/alternatives/ga50.tif"),overwrite=TRUE)
    }
}
D:/cwrNA/src/modeling/maxnet/evaluate_function.r 

###
# runs statistics on the predictive capability of the median model
# Based on work by the CIAT group
# dan.carver@carverd.com
# 20200414
###

evaluate_function <- function(species){
  # Summarise the replicate metrics in the global evaluate_table into a
  # one-row table, add the ASD15 statistic and a VALID flag, and write the
  # result to <crossValDir>/eval_metrics.csv.
  #
  # species : taxon name; used to find <sp_dir>/modeling/<species>_prj_std.tif
  #           and to label the output row.
  #
  # Reads the globals sp_dir, evaluate_table and crossValDir.
  # Returns the one-row summary table invisibly. The function used to return
  # the NULL result of write.csv, although its caller stores the return value
  # as the final evaluation table.

  ### ASD15
  esdCpt <- raster(paste0(sp_dir, "/modeling/",species,"_prj_std.tif"))
  dumm <- raster(paste0(sp_dir, "/modeling/spdist_thrsld_median.tif"))

  # keep the standard deviation only where the thresholded model is not ~0
  esdCpt[which(dumm[] < 0.001)] <- NA

  # reference raster: 1 in every cell that still has a value
  esdCpt_ref <- esdCpt
  esdCpt_ref[which(!is.na(esdCpt[]))] <- 1

  # 0/1 raster of the cells at or above 0.15 STD (below = 0, above = 1)
  esdCpt_a15 <- esdCpt
  esdCpt_a15[which(esdCpt[] >= 0.15)] <- 1
  esdCpt_a15[which(esdCpt[] < 0.15)] <- 0

  # make a raster of cell area
  dist_area <- area(esdCpt)

  # area of the retained distribution, and the share of it at or above 0.15 STD
  szCpt <- dist_area * esdCpt_ref
  szCptUncertain <- dist_area * esdCpt_a15
  rateCpt <- sum(szCptUncertain[],na.rm=TRUE) / sum(szCpt[],na.rm=TRUE) * 100

  #############################
  rm(dumm,esdCpt);gc()
  #############################

  # column means of every metric except column 10; the output width follows
  # the input rather than assuming 16 columns
  metricMeans <- colMeans(evaluate_table[,-10],na.rm = TRUE)
  evaluate_table_f <- data.frame(matrix(nrow = 1,ncol = length(metricMeans)))
  evaluate_table_f[1,] <- metricMeans
  colnames(evaluate_table_f) <- names(metricMeans)
  # SP
  evaluate_table_f[,"species"]<- as.character(species)
  evaluate_table_f[,"STAUC"]<- as.numeric(sd(evaluate_table$AUCtrain,na.rm = TRUE))
  # the first column (the mean training AUC) is reported as ATAUC
  colnames(evaluate_table_f)[1] <- "ATAUC"

  #### ASD15: an undefined rate (e.g. 0/0) is reported as 100
  if(is.na(rateCpt)){
    evaluate_table_f$ASD15 <- 100
  } else {
    evaluate_table_f$ASD15 <- rateCpt
  }

  # the model is valid only when all four criteria hold; isTRUE() turns a
  # missing metric into "not valid" instead of an error inside `if`
  evaluate_table_f$VALID <- isTRUE(
    evaluate_table_f$ATAUC >= 0.7 &&
      evaluate_table_f$STAUC < 0.15 &&
      evaluate_table_f$ASD15 <= 10 &&
      evaluate_table_f$cAUC >= 0.4
  )

  # write output table
  write.csv(evaluate_table_f,paste0(crossValDir,"/","eval_metrics.csv"),quote = FALSE,row.names = FALSE)
  invisible(evaluate_table_f)
}
D:/cwrNA/src/modeling/maxnet/evaluate_sdm_function.r 

###
# generates the median threshold raster. Calls evaluate_function to assess the
# statistics on the median threshold model.
# Based on work by the CIAT group
# dan.carver@carverd.com
# 20200414
###

evaluate_sdm_function <- function(species){
    # Load the fitted model, gather the replicate metrics, create (or reuse)
    # the thresholded median raster and build the final evaluation table.
    #
    # species : taxon name; used in file names and messages.
    #
    # Reads the globals sp_dir and cleanPoints. Sets the globals sdm,
    # evaluate_table and thrshold, plus evaluate_table_f when the species has
    # at least three points. Always returns `species`.

    # report a species with too few points and write an all-NA metrics row
    # flagged as not valid
    writePlaceholderMetrics <- function(){
      cat("Species:", species, "only has", nrow(cleanPoints), "coordinates, it is not appropriate for modeling\n")

      placeholder <- data.frame(ATAUC=NA,AUCtest=NA,nAUC=NA,cAUC=NA,sensi_train=NA,speci_train=NA,threshold_train=NA,
                                max.TSS_train=NA,minROCdist_train=NA,threshold_test=NA,sensi_test=NA,speci_test=NA,matthews.cor_test=NA,
                                LR_pos_test=NA,LR_neg_test=NA,kappa_index_test=NA,species=NA,STAUC=NA,ASD15=NA,VALID=NA)
      placeholder[,"VALID"] <- FALSE
      placeholder[,"species"] <- species
      write.csv(placeholder, paste0(sp_dir,"/","eval_metrics.csv"),row.names=FALSE,quote=FALSE)
    }

    # no saved model: report it and, for tiny data sets, still leave a
    # placeholder metrics file behind
    if(!file.exists(paste0(sp_dir,"/","sdm.rds"))){
      cat(paste(species," not modelled yet"),"\n")
      if(nrow(cleanPoints)<3){
        writePlaceholderMetrics()
      } else {
        cat(paste(species," not modelled yet"),"\n")
      }
      return(species)
    }

    cat("Loading sdm results!", "\n")
    sdm <<- readRDS(paste0(sp_dir,"/","sdm.rds"))

    # metrics of the individual model runs
    cat("Gathering replicate metrics  for: ", species, "\n")
    evaluate_table <<- metrics_function(species)

    # the cut-off is the mean training threshold over the runs
    cat("Thresholding using Max metrics  for: ", species, "\n")
    cutoff <- as.numeric(mean(evaluate_table[,"threshold_train"],na.rm=TRUE))

    thresholdPath <- paste0(sp_dir, "/modeling/spdist_thrsld_median.tif")
    if (file.exists(thresholdPath)) {
      # reuse the raster written by an earlier run
      binaryRaster <- raster(thresholdPath)
    } else {
      # binarise the median projection; the two assignments run in this order
      binaryRaster <- raster(paste0(sp_dir,"/modeling/",species,"_prj_median.tif"))
      binaryRaster[which(binaryRaster[] >= cutoff)] <- 1
      binaryRaster[which(binaryRaster[] < cutoff)] <- 0
      raster::writeRaster(x = binaryRaster, filename = thresholdPath,overwrite = TRUE)
    }
    thrshold <<- binaryRaster

    if(nrow(cleanPoints)>=3){
      # Gathering final evaluation table
      evaluate_table_f <<- evaluate_function(species)
    } else {
      writePlaceholderMetrics()
    }

    return(species)
}
D:/cwrNA/src/modeling/maxnet/generateModelingData.r 

###
# generate background points based on the size of the native area
# 20190904
# dan.carver@carverd.com
###

generateModelingData <- function(species){
  # Sample background points inside the native area, drop those that fall on
  # a presence location, extract the predictor values for background and
  # presence points and store the combined table.
  #
  # species : taxon name; accepted for a uniform interface, not used here.
  #
  # Reads the globals nativeArea, cleanPoints, bioVars and sp_dir.
  # Sets the global bioValues and writes it to
  # <sp_dir>/occurrences/presBackgroundWithBiovars.csv.

  # number of background points: area of the native range times 58, capped
  # at 5000
  numberBackground <- function(area){
    min(rgeos::gArea(area)*58, 5000)
  }
  n <- numberBackground(nativeArea)

  # produce background points based on native area
  bck_data <- spsample(nativeArea, n = n, type = "random" )
  crs(bck_data) <- crs(nativeArea)
  print(1) # numeric progress markers are kept as they were
  # test so that background points do not overlap with presence points
  # 1. buffer the known presence locations by 0.000556 (coordinate units)
  presBuff <- rgeos::gBuffer(sp::SpatialPoints(cleanPoints@coords), width=0.000556)
  crs(presBuff) <- crs(nativeArea)
  print(2)
  # 2. find the background points that fall inside the buffer: over() gives
  #    the polygon index for a point inside the buffer and NA otherwise
  inBuffer <- unname(over(bck_data, presBuff))

  # Drop the overlapping points only when both overlapping and free points
  # exist. The filter previously wrote to a misspelled column and only worked
  # because the name `intersect` resolved to a local data.frame.
  if(length(unique(inBuffer))>1){
    bck_data <- bck_data[is.na(inBuffer), ]
  }
  print(3)
  # 3. extract all values to background points
  rasterStack <- bioVars$as.RasterStack() %>%
    raster::crop(nativeArea)
  bck_vals <- raster::extract(x = rasterStack,y = bck_data)
  bck_data_bio <-as.data.frame(cbind(bck_data@coords, bck_vals))%>%
    mutate(presence = 0)
  # store the coordinates under the same names the presence table uses
  bck_data_bio$longitude <- bck_data_bio$x
  bck_data_bio$latitude <- bck_data_bio$y
  bck_data_bio <- bck_data_bio %>% dplyr::select(-c("x","y"))
  print(4)
  # extract values to presence points
  prs_vals <- raster::extract(x = rasterStack,y = sp::SpatialPoints(cleanPoints@coords))
  print(4.1)
  # NOTE(review): columns 2:3 of cleanPoints@data are presumably the latitude
  # and longitude columns; confirm against the code that builds cleanPoints
  prs_data_bio <- as.data.frame(cbind(prs_vals,cleanPoints@data[,2:3])) %>%
    mutate(presence = 1)
  print(5)
  # merge presence and background sets
  bioValues <<- dplyr::bind_rows(bck_data_bio,prs_data_bio)

  # write out csv of background/presence data
  write.csv(x = bioValues, file = paste0(sp_dir, "/occurrences/presBackgroundWithBiovars.csv"))
}
D:/cwrNA/src/modeling/maxnet/kernalDensity.R 

###
# Function to generate a kernel density map of the occurrence data for CWR NA.
# This is a visualization tool that will help in understanding the potential
# spatial biasing in the occurrence dataset. I want to be able to highlight
# areas of high sampling density and areas of low density. A possible extension
# of this is attempting to smooth the variability across the area
# by adjusting the number of points in a region, though that will be a
# complicated process.
# 20200414
# dan.carver@carverd.com
###

kernalDensity <- function(species){
  # Compute a standardized kernel density surface from the global cleanPoints,
  # passing the global thrshold raster as `newdata`, and write it to
  # <sp_dir>/modeling/kernalDensity.tif. `species` is not used in the body.
  densitySurface <- spatialEco::sp.kde(x = cleanPoints,
                                       newdata = thrshold,
                                       standardize = TRUE)
  densityPath <- paste0(sp_dir, "/modeling/kernalDensity.tif")
  raster::writeRaster(x = densitySurface, filename = densityPath, overwrite=TRUE)

  # Idea for a follow-up experiment (not implemented):
  #  - extract the kernel values back to cleanPoints
  #  - filter out a portion of the points with high kernel density
  #  - re-run the KDE
  #  - repeat until the distribution of the kernel density values reaches a
  #    chosen level (for example a target range of KDE values)
  #  - re-run the modeling methodology and compare the outputs
}
D:/cwrNA/src/modeling/maxnet/messMap.R 

###
# MESS Map : filtered model extent by range of top predictor
# 20200414
# dan.carver@carverd.com
###

messMap <- function(species) {
  # Restrict the thresholded model to the cells whose value of the top
  # predictor lies inside the range observed at the presence points, and
  # write the result to <sp_dir>/modeling/messMapThres.tif.
  #
  # species : taxon name; accepted for a uniform interface, not used here.
  #
  # Reads the globals sp_dir and rastersToModel; re-reads the threshold raster
  # from disk and stores it in the global thrshold.

  # Load in top predictors
  topPre <- read.csv(file = paste0(sp_dir,"/modeling/maxent/predictorImportance.csv" ))

  # load in the predictor values of the presence points
  bioValue2 <- read.csv(file = paste0(sp_dir,"/modeling/maxent/bioValuesForPresencePoints.csv" ))%>%
    dplyr::filter(presence == 1)

  # name of the predictor in the first row of the importance table
  top1 <- as.character(topPre$varNames[1])

  # range of the top predictor at the presence points
  # 20191031, the +/- 1% buffer around the range was dropped.
  r1 <- range(bioValue2[[top1]], na.rm = TRUE)
  h1 <- r1[2]
  b1 <- r1[1]

  # load in raster for top predictor
  topRast <- rastersToModel%>%
    raster::subset(top1)

  # 1 inside the observed range, 0 outside. The mask is built from the
  # comparison itself so that in-range values that are zero or negative are
  # also flagged; recoding with `> 0` left them untouched.
  inRange <- (topRast >= b1) & (topRast <= h1)

  # refresh the global threshold raster from disk
  thrshold <<- raster::raster(paste0(sp_dir, "/modeling/spdist_thrsld_median.tif"))
  # local copy in which predicted absence (0) is treated as no data
  presenceOnly <- thrshold
  presenceOnly[presenceOnly == 0] <- NA

  # predicted presence flagged by whether the top predictor is in range
  messRaster <- presenceOnly * inRange
  raster::writeRaster(x = messRaster, filename = paste0(sp_dir, "/modeling/messMapThres.tif"),overwrite=TRUE)
}
D:/cwrNA/src/modeling/maxnet/metricsFuction.R 

###
# compiles individual model run data
# built on content from CIAT group
# dan.carver@carver.com
# 20200414
###

metrics_function<-function(species){
  # Collect the metrics of the individual model runs stored in the global sdm
  # object into one table with one row per run.
  #
  # species : taxon name; accepted for a uniform interface, not used here.
  #
  # Sets the global crossValDir to <sp_dir>/modeling/maxent, writes the table
  # to <crossValDir>/eval_metrics_rep.csv and returns it.

  # stack the per-run entries of one sdm element on top of each other
  stackRuns <- function(perRun){
    do.call(rbind, perRun)
  }

  # evaluation tables, suffixed so train and test columns stay apart
  trainEval <- stackRuns(sdm$evaluation_train)
  colnames(trainEval) <- paste0(colnames(trainEval),"_","train")
  testEval <- stackRuns(sdm$evaluation_test)
  colnames(testEval) <- paste0(colnames(testEval),"_","test")

  # AUC family first, then the train and the test evaluation columns
  aucTable <- data.frame(
    AUCtrain = stackRuns(sdm$AUC_train),
    AUCtest = stackRuns(sdm$AUC),
    nAUC = stackRuns(sdm$nAUC),
    cAUC = stackRuns(sdm$cAUC)
  )
  runMetrics <- cbind(aucTable, trainEval, testEval)

  crossValDir <<- paste0(sp_dir, "/modeling/maxent")
  write.csv(runMetrics,paste0(crossValDir,"/","eval_metrics_rep.csv"),quote = FALSE,row.names = FALSE)

  return(runMetrics)
}
D:/cwrNA/src/modeling/maxnet/runMaxnet.R 

###
# run  maxnet, include validations and projections to rasters
# based on work by CIAT group
# 20200414
# dan.carver@carverd.com
###

runMaxnet <- function(species){
    tryCatch({
        # pull out presence points
        nPresence <- bioValues %>%
          filter(presence == 1) %>% nrow()

        if(nPresence <= 8 & nPresence >3){
             kfold <- 3
             feat <- "lp"
        }
        if(nPresence > 8 & nPresence <= 25){
          kfold <- 6
          feat <- "lqph"
        }
        if(nPresence > 25){
          kfold <- 10
          feat <- "lqph"
        }


          # select needed raster bands
          rastersToModel <<- bioVars$as.RasterStack() %>%
            raster::subset(names(variblesToModel))%>%
            raster::crop(nativeArea)%>%
            raster::mask(nativeArea)
          # develop modeling data
          bioValuesModel <- bioValues[complete.cases(bioValues),] %>%
            dplyr::select(names(variblesToModel)) %>%
            dplyr::mutate(presence = bioValues$presence, latitude = bioValues$latitude, longitude = bioValues$longitude)

          ####
          # I'm wrapping the modeling steps into a function because there are species which
          # can not be models via all reps because when the data is spilts, specific groups
          # do not have any presences points. If you re run the cvfolds process, you get new
          # groups of data, this sometimes produces splits in the data that allow for the
          # process to run, hence we get a successful model.
          modelsteps <- function(){
            # run model within here
            cvfolds <- modelr::crossv_kfold(bioValuesModel,k = kfold)
            #### 20200127, trying to get a consistent test train split. I will try running the
            # models with this method and see what happen.

            ### 20200128, not going to work out until i figure out how to index
            # ## Create a dataframe that records the number of test and train
            # paCount <- data.frame(matrix(nrow = kfold, ncol = 5))
            # colnames(paCount) <- c("modelRun", "presenceTrain",
            #                        "backgroundTrain", "presenceTest","backgroundTest")
            # for(i in 1:kfold){
            #   paCount$modelRun[i] <- i
            #   paCount$presenceTrain[i] <- length(which(cvfolds$train[i]$data$presence == 1))
            #   paCount$backgroundTrain[i] <- length(which(cvfolds$train[i]$data$presence == 1))
            #   paCount$presenceTest[i] <- length(which(cvfolds$test[i]$data$presence == 1))
            #   paCount$backgroundTest[i] <- length(which(cvfolds$test[i]data$presence == 1))
            # }
            #
            sdm_results <- cvfolds %>% dplyr::mutate(.
                                                      #train sdm models using Maxnet and train data
                                                      ,model_train = purrr::map2(.x = train, .y = .id, function(.x, .y){

                                                        cat("Training MAXNET model for fold", .y, ", all presence points added to background \n")

                                                        data_train <<- as.data.frame(.x)
                                                        #select all presence and add them as background as well.
                                                        pres <- data_train%>% filter(presence == 1)
                                                        pres$presence <- rep(0, length(pres$presence))
                                                        data_train <- rbind(data_train, pres)
                                                        p <- data_train$presence
                                                        # cat(print(p),"\n")
                                                        data <- data_train %>% dplyr::select(names(variblesToModel))
                                                        fit.maxent <- maxnet::maxnet(p       = p,
                                                                                     data    = data,
                                                                                     #regmult = beta,
                                                                                     f       = maxnet.formula(p, data, classes = feat))

                                                        return(fit.maxent)

                                                      })

                                                      #evaluate trained model
                                                      , predictions_train = purrr::pmap(list(.x = model_train, .y = .id, .z = train), function(.x, .y, .z){
                                                        cat("Predicting train data for fold", .y, "\n")
                                                        train <- as.data.frame(.z)
                                                        predictions <- raster::predict(object = .x,
                                                                                       newdata = train%>% dplyr::select(names(variblesToModel)),
                                                                                       type = "logistic")
                                                        dt <-  data.frame(obs = factor(train$presence), pred = predictions)


                                                        return(dt)
                                                      })
                                                      #calculate auc for trained model
                                                      ,AUC_train = purrr::map2(.x = predictions_train, .y = .id, function(.x, .y){
                                                        cat("Calculating AUC_train for model", .y,"\n")
                                                        croc <- pROC::roc(response = .x$obs, predictor = .x$pred)

                                                        return(as.numeric(croc$auc))
                                                      })
                                                      #calculate max preformance measures (sensitivity, specificity and Treshold) using train data
                                                      ,evaluation_train = purrr::map2(.x = predictions_train, .y = .id, function(.x, .y){

                                                        cat("Calculating optimal threshold for model", .y, "\n")
                                                        croc <- pROC::roc(response = .x$obs, predictor = .x$pred)
                                                        croc_summ <- data.frame (sensi = croc$sensitivities, speci = croc$specificities, threshold =  croc$thresholds) %>%
                                                          round(., 3) %>%
                                                          dplyr::mutate(., max.TSS = sensi + speci - 1) %>%
                                                          dplyr::mutate(., minROCdist = sqrt((1- sensi)^2 + (speci -1)^2))

                                                        max.tss <- croc_summ %>% dplyr::filter(., max.TSS == max(max.TSS)) %>%
                                                          dplyr::mutate(., method = rep("max(TSS)", nrow(.)))

                                                        minRoc <- croc_summ %>%
                                                          dplyr::filter(., minROCdist == min(minROCdist))%>%
                                                          dplyr::mutate(., method = rep("minROCdist", nrow(.)))

                                                        croc_summ <- rbind(max.tss, minRoc) %>%
                                                          dplyr::filter(., speci == max(speci))  %>%
                                                          dplyr::sample_n(., 1)

                                                        return(croc_summ)
                                                      })
                                                      #Make predictions using testing data
                                                      , predictions_test = purrr::pmap(list(.x = test, .y = model_train, .z = .id), function(.x, .y, .z){

                                                        cat("Using test data to predict model", .z," \n")
                                                        test <- as.data.frame(.x)
                                                        predictions <- raster::predict(object = .y,
                                                                                       newdata = test%>% dplyr::select(names(variblesToModel)),
                                                                                       type = "logistic")
                                                        dt <-  data.frame(obs = factor(test$presence), pred = predictions)

                                                        return(dt )
                                                      })
                                                      #Calculate AUC for testing
                                                      , AUC = map2(.x = predictions_test, .y = .id, function(.x, .y){
                                                        cat("Calculating AUC for model", .y,"\n")
                                                        croc <- pROC::roc(response = .x$obs, predictor = .x$pred)

                                                        return(as.numeric(croc$auc))
                                                      })
                                                      #calculate max preformance measures (sensitivity, specificity and Treshold) using max(TSS) criterion
                                                      , evaluation_test = pmap(list(.x = evaluation_train, .y = .id, .z = predictions_test), function(.x, .y, .z){
                                                        cat("Calculating evaluation for model", .y,"\n")

                                                        thr <- .x$threshold

                                                        a <- .z %>% dplyr::filter(., pred >= thr & obs == 1) %>% nrow()
                                                        b <- .z %>% dplyr::filter(., pred >= thr & obs == 0) %>% nrow()
                                                        c <- .z %>% dplyr::filter(., pred < thr & obs == 1) %>% nrow()
                                                        d <- .z %>% dplyr::filter(., pred < thr & obs == 0) %>% nrow()

                                                        #senitivity and specificity
                                                        se <- a/(a+c)
                                                        es <- d/(b+d)
                                                        #Matthews correlation coefficient
                                                        den <- sqrt(a+b)*sqrt(a+c)*sqrt(d+b)*sqrt(d+c)
                                                        den <- ifelse(den  != 0 ,den, 1 )
                                                        mcc <- (a*d - b*c)/den
                                                        #Likelyhood Ratio +
                                                        lr_ps <- se/(1 - es)
                                                        #Likelihood ratio -
                                                        lr_ne <- (1 - se)/es

                                                        #calculate kappa index
                                                        pr_a <- (a+d)/(a+b+c+d)
                                                        pr_e <- (((a+b)/(a+b+c+d))* ((a+c)/(a+b+c+d))) + ( ((c+d)/(a+b+c+d) )* ((b+d)/(a+b+c+d) ))
                                                        kappa <- (pr_a - pr_e)/(1 - pr_e)


                                                        evaluation <- data.frame(threshold= thr, sensi = se, speci = es, matthews.cor = mcc, LR_pos = lr_ps, LR_neg = lr_ne, kappa_index = kappa)
                                                        return(evaluation)
                                                      })
                                                      #Calculate nAUC using both train and test data
                                                      , nAUC = pmap(list(.x = train, .y = test, .z = .id), function(.x, .y, .z){
                                                        cat("calculating AUC from NULL model", .z,"\n")

                                                        train_dt <- as.data.frame(.x) %>% dplyr::select(., presence, longitude, latitude)
                                                        test_dt  <- as.data.frame(.y) %>% dplyr::select(., presence, longitude, latitude)


                                                        train_p <- train_dt[which(train_dt$presence == 1), 2:3]
                                                        train_a <- train_dt[which(train_dt$presence == 0), 2:3]

                                                        gd <- dismo::geoDist(p = train_p, a = train_a, lonlat=TRUE)
                                                        pred <- dismo::predict(gd, test_dt %>% dplyr::select(longitude, latitude))

                                                        nAUC <- pROC::roc(response = test_dt$presence, predictor = pred)
                                                        return(as.numeric(nAUC$auc))
                                                      })
                                                      #Calculate cAUC using the formula cAUC = AUC + 0.5 - max( 0.5, nAUC)
                                                      , cAUC = purrr::pmap(list(.x = AUC, .y = nAUC, .z = .id), function(.x, .y, .z){
                                                        cat("Calculating AUC correction using NULL model", .z, " \n")
                                                        cAUC = .x + 0.5 - max( 0.5, .y)
                                                        return(cAUC)
                                                      })
                                                      #Project rasters using maxnet model for mean, median and sd
                                                      , do.projections =  purrr::pmap(list(.x = model_train, .y = .id, .z = evaluation_train) ,function(.x, .y, .z){

                                                        cat(">>> Proyecting MAXNET model", .y,"to a raster object \n")
                                                        r <- raster::predict(rastersToModel, .x, type = "logistic", progress='text')
                                                        writeRaster(r, paste0(sp_dir,"/modeling/replicates/",species,"_prj_rep-", .y,".tif"), format="GTiff", overwrite = TRUE)
                                                        #thresholding raster
                                                        # if(!validation){
                                                        #   r[which(r[] < .z$threshold)] <- NA
                                                        # }
                                                        writeRaster(r, paste(sp_dir,"/modeling/replicates/",species,"_prj_th_rep-", .y,".tif",sep=""), format="GTiff", overwrite = TRUE)
                                                        return(r)
                                                      })


            )#end mutate
            return(sdm_results)
          }
          ## set up while loop to test for itorations and successful run via the creation of
          # the sdm object.
          sdm_results <- NULL
          attempt <- 1
          while(is.null(sdm_results) && attempt <=3){
            attempt <- attempt + 1
            try(
              sdm_results <- modelsteps()
              )# end try
          }# end while loop
          sdm_results <<- sdm_results
          #calculate  mean, median and sd raster from replicates
          prj_stk <- sdm_results %>% dplyr::select(., do.projections) %>% unlist() %>% raster::stack() %>% raster::mask(nativeArea)
          cat("Calculating mean, median and sd for replicates \n")
          mean(prj_stk) %>% writeRaster(., paste0(sp_dir,"/modeling/", species, "_prj_mean.tif" ), overwrite = TRUE)
          cat("Mean raster calculated \n")
          raster::calc(prj_stk, fun = function(x) {median(x)}) %>% writeRaster(., paste0(sp_dir,"/modeling/", species, "_prj_median.tif" ), overwrite = TRUE)
          cat("Median raster calculated \n")
          raster::calc(prj_stk, fun = function(x) {sd(x)}) %>% writeRaster(., paste0(sp_dir,"/modeling/", species, "_prj_std.tif" ), overwrite = TRUE)
          cat("Sd raster calculated \n")


          #save all results in an .rds file
          cat("Process Done... Saving results as .rds file in the path", paste0(sp_dir, "/sdm.rds"), " \n")
          saveRDS(sdm_results, paste0(sp_dir, "/sdm.rds"));gc()

          cat(" ","\n")
          cat("Maxent model finished and saved","\n")
          cat(" ","\n")
          return(sdm_results)
      }
      # ... but if an error occurs, tell me what happened:
      , error=function(error_message) {
        message("This species encountered an error it will be added to a list to evalualte later")
        message("And below is the error message from R:")
        message(error_message)
        return(NA)
      }
    )
}
D:/cwrNA/src/modeling/maxnet/variableSelection.R 

###
# perform variable selection on the data and generate the input dataset for the
# modeling methods
# 20190904
# dan.carver@carverd.com
###

# Rank predictors with VSURF, drop highly correlated ones, and publish the
# inputs used by the maxnet modeling step.
#
# Reads the globals `bioValues` and `sp_dir`. Side effects:
#   * overwrites the global `bioValues` with its complete-case rows,
#   * writes bioValuesForPresencePoints.csv and predictorImportance.csv under
#     <sp_dir>/modeling/maxent/,
#   * assigns the global `variblesToModel` (the retained predictor columns).
#
# species: not used inside this function; kept so existing callers still work.
#
# Returns (invisibly) the data frame assigned to `variblesToModel`.
varaibleSelection <- function(species){
  # subset predictor data and presence column
  varSelect <- bioValues %>% dplyr::select(-c(longitude,latitude ))
  # keep only the rows that have no missing values
  completeRows <- complete.cases(varSelect)
  varSelect <- varSelect[completeRows,]
  # apply the same row filter to bioValues so the same data is used for maxnet modeling.
  bioValues <<- bioValues[completeRows,]
  write.csv(x = bioValues, file = paste0(sp_dir, "/modeling/maxent/bioValuesForPresencePoints.csv"))


  # # #vsurf
  ### Considered altering the number of trees, 100 is somewhat low for the
  # number of predictors used. It was a time concern more then anything.
  vsurfThres <- VSURF_thres(x=bioValues[,1:26] , y=as.factor(bioValues$presence) ,
                            ntree = 100 )

  ###
  #correlation matrix
  ###

  # column indices of the retained predictors, ordered by VSURF importance
  inputPredictors <- vsurfThres$varselect.thres

  # NOTE(review): the indices refer to columns of bioValues[,1:26] but are
  # applied to varSelect (bioValues without longitude/latitude). They only line
  # up if longitude/latitude sit after column 26 - confirm.
  # drop = FALSE keeps a data frame even when a single predictor is selected.
  predictors <- varSelect[, c(inputPredictors), drop = FALSE]
  # Calculate correlation coefficient matrix
  correlation <- cor(predictors, method="pearson")

  # predictor names in importance order; this vector is never modified so that
  # position i always refers to column i of the correlation matrix
  rankedVars <- colnames(correlation)
  # names still retained for the final model
  varNames <- rankedVars

  # walk through (at most) the top 5 predictors and drop every lower-ranked
  # predictor that is strongly correlated (|r| > 0.7) with a retained one
  for(i in seq_len(min(5, length(rankedVars)))){
    current <- rankedVars[i]
    print(current)
    if(current %in% varNames){
      # only predictors ranked below the current one can be removed, so a
      # variable that has been tested is never dropped afterwards
      lowerRanked <- rankedVars[-seq_len(i)]
      corVals <- correlation[lowerRanked, current]
      isCorrelated <- !is.na(corVals) & abs(corVals) > 0.7
      corVar <- intersect(lowerRanked[isCorrelated], varNames)
      if(length(corVar) > 0){
        varNames <- varNames[!varNames %in% corVar]
        print(paste0("the variable ", corVar, " was removed"))
      }
    }else{
      print("this variable has been removed already")
    }
  }

  # table of all ranked predictors, their importance and whether they are kept
  rankPredictors <- data.frame(varNames = rankedVars,
                               importance = vsurfThres$imp.varselect.thres,
                               includeInFinal = rankedVars %in% varNames,
                               stringsAsFactors = FALSE)
  write.csv(x = rankPredictors, file = paste0(sp_dir, "/modeling/maxent/predictorImportance.csv"))

  variblesToModel <<- varSelect[, varNames, drop = FALSE]
}
D:/cwrNA/src/run_lineal.R 

### 
# Primary script for running CWRNA. This should be the only location where users have
# to edit information.
# 20200414
# dan.carver@carverd.com
### 

library(tidyverse)
library(sp)
library(raster)
library(rgdal)
library(tmap)
library(devtools)
#install_github("DFJL/SamplingUtil")
library(geobuffer)
library(SamplingUtil)
library(velox)
tmap::tmap_mode("view")
library(rgeos)
library(randomForest)
library(VSURF)
library(modelr)
library(maxnet)
library(pROC)
library(dismo)
library(redlistr)
library(fasterize)

# set all standard directories
# base_dir is the single root from which every other location is derived.
base_dir <<- "D:/cwrNA"
repo_dir <<- paste0(base_dir , "/src")
gap_dir <<- paste0(base_dir , "/gap_analysis")
par_dir <<- paste0(base_dir , "/parameters")
occ_dir <<- paste0(par_dir, "/occurenceData")
temp_dir <<- paste0(base_dir , "/TEMP")

#set name of the run version 
run_version <<- "test20200203"

#set adjustable parameters 
numPoints <<- 2000 # maximun number of points used in model (use in subSampleCountry.R)
bufferDist <<- 50000 # used to define buffer distance in gBuffer.r ## had to change from 0.5 when
#swtiched to geobuffer package for SF object generation. 
set.seed(1234)

# set all primary file sources
bioVars <<- readRDS(paste0(par_dir,"/bioLayer_2.5/climate_vx.RDS"))
countrySHP <<- readOGR(paste0(par_dir,"/ne_10m_admin/ne_10m_admin_0_countries.shp"),verbose = FALSE)
# exculing pacific territories- runs near all species faster 
#naSHP <<- readOGR(paste0(par_dir,"/northAmericaArea/northAmericaArea.shp"),verbose = FALSE)
# include pacific territories 
naSHP <<- readOGR(paste0(par_dir, "/allUSAArea/NorthAmerica_AllUSA.shp"), verbose = FALSE)
ecoReg <<- readOGR(paste0(par_dir,"/ecoregions/tnc_terr_ecoregions.shp"),verbose = FALSE)
# occurrence table; the path is built from base_dir so that changing base_dir
# relocates this input together with all the others
occData <- data.table::fread(paste0(base_dir, "/occurrence_data2019_05_29/combinedOccurance2020-04-07.csv"),
                              header = TRUE)
# drop the first column (presumably the row index added when the CSV was
# written with write.csv - confirm)
occData <<- occData[,2:ncol(occData)]
proArea <<- raster::raster(paste0(par_dir, "/protectedAreas/wdpa_reclass.tif"))
layerDescription <<- read.csv(paste0(par_dir, "/layerDesrciptions.csv"))
statesData <<- read.csv(paste0(par_dir, "/statePerTaxon/CWRofUSA_nativeareas_2020_1_30.csv"))
statesSpObject <<- readRDS(paste0(par_dir, "/statesByCountry/gadmCanUsaMex_sp.rds"))

# Load the sources scripts
# Collect every file under repo_dir whose extension is .r or .R. The dot is
# escaped so that it matches a literal "." rather than any character.
source.files <- list.files(repo_dir, "\\.[rR]$", full.names = TRUE, recursive = TRUE)
# skip scripts that must not be sourced as part of a run: the database
# transforms, anything with "test" in its path, this driver script itself
# ("lineal") and the markdown summaries
source.files <- source.files[ !grepl("dataBaseTransform", source.files) ]
source.files <- source.files[ !grepl("test", source.files) ]
source.files <- source.files[ !grepl("lineal", source.files) ]
source.files <- source.files[ !grepl("summaryMarkdown", source.files) ]


#lapply(source.files, source)
# seq_along() yields an empty sequence when no file is found, whereas
# 1:length() would iterate over c(1, 0) and fail on source(NA)
for(i in seq_along(source.files)){
  # print the index so a failing script can be located in source.files
  cat(i,"\n")
  source(source.files[i])
}


# set loop at genus level
genera <- sort(unique(occData$genus))
testGen <- genera[seq_along(genera)]

# optional species list for running specific species from the genera 
# spList <- c( "Acer saccharum subsp. ozarkense","Vanilla mexicanas")

# select all species at the genus level and apply master script to run process
# For every genus: write the raw occurrence subset, build the species list,
# run master_run() on each species and render the genus summary report.
beepr::beep_on_error(
  for(i in genera){
    t2a <- Sys.time()
    genus <<- i
    if (!file.exists(paste0(gap_dir,"/summaryDocs"))) {dir.create(paste0(gap_dir,"/summaryDocs"),recursive=TRUE)}
    allSpec <- occData %>%
      dplyr::filter(genus == i)
    # generate a folder within the gap analysis
    folder <- paste0(occ_dir, "/",i)
    if (!file.exists(folder)) {dir.create(paste0(folder),recursive=TRUE)}
    # test for genus level folder.
    genFolder <- paste0(gap_dir, "/", i)
    if (!file.exists(genFolder)) {dir.create(paste0(genFolder),recursive=TRUE)}
    write.csv(allSpec, paste0(folder, "/", "raw",i,".csv"), row.names = FALSE)
    genusOcc <<- read.csv(paste0(folder, "/", "raw",i,".csv"))
    speciesList <<- sort(unique(allSpec$taxon))
    write.csv(x = speciesList, file = paste0(gap_dir,'/', genus, "/", 'speciesList.csv'))
    
    #test
    ### 20200227 here for trouble specific species in spList 
    # spList is optional (its definition above is commented out by default);
    # only restrict the run when it has actually been defined, otherwise the
    # lookup would stop the script with "object 'spList' not found"
    if (exists("spList")) {
      speciesList <- speciesList[speciesList %in% spList]
    }
    
    #calls the master function 
    result_master = lapply(speciesList, master_run)

    # the report is best effort: a rendering failure must not stop the loop
    try(rmarkdown::render(paste0(repo_dir, "/summaryMarkdown/summaryOfGenus.rmd"),  # file 2
                             output_file =  paste("SummaryReport_", genus , Sys.Date(), ".html", sep=''),
                             output_dir = paste0(gap_dir,"/", genus,"/summaryDocs")))
    t2b <- Sys.time()
    # force minutes: a plain subtraction lets difftime choose secs/mins/hours,
    # which made the "minutes" label below wrong for short or long runs
    totalTime <- difftime(t2b, t2a, units = "mins")
    print(paste0("the genus ", genus," includes ",
                 length(speciesList), " species in a total of ",
                 round(as.numeric(totalTime), 2)," minutes."))
      
    }
)

D:/cwrNA/src/test/barPlots.R 

###
#20200325
# script to generate bar chart summaries 
# dan.carver@carverd.com
###
library(plotly)
# read in metrics data 
d1 <- read.csv("D:/cwrNA/runSummaries/allMetricData2020-04-12forFigures.csv")
dim(d1)
# keep only the taxa flagged for inclusion in the summary metrics
d1 <- d1[d1$Included.in.summary.metrics == "Y",]
dim(d1)

#set wd 
# all CSV / HTML outputs written below land in this folder
setwd("D:/cwrNA/parameters/barCharts")

# this is not perfect at the moment but it is working.


### generated the data structure I needed to test the plot method 
#d2 <- read.csv(file = "D:/cwrNA/parameters/barCharts/testchart.csv")

# Return the percentages stored in tbl$per in the order given by `categories`.
# tbl: summary table whose first column is the priority category.
# A category that is absent from tbl yields 0 instead of shifting the
# remaining values into the wrong column (which positional reordering did).
# NOTE(review): the category codes HP/MP/LP/SC are taken from the column names
# used for df2 below; a warning is raised if the data hold any other code.
perByCategory <- function(tbl, categories){
  found <- as.character(tbl[[1]])
  unexpected <- setdiff(found, c(categories, NA))
  if(length(unexpected) > 0){
    warning("unexpected priority categories ignored: ",
            paste(unexpected, collapse = ", "))
  }
  per <- as.numeric(tbl$per)[match(categories, found)]
  per[is.na(per)] <- 0
  per
}

# number of taxa with a combined (FCSc mean) priority category
# NOTE(review): this count is also the denominator for the in situ and ex situ
# percentages below - confirm that is intended.
total <- length(d1$FCSc.mean.priority.category[!is.na(d1$FCSc.mean.priority.category)])
tA <- d1 %>% 
  group_by(FCSc.mean.priority.category) %>%
  dplyr::summarise(count = dplyr::n())%>%
  dplyr::mutate(per=paste0(round(100*count/total,2)))
tA
tI <- d1 %>% 
  group_by(FCSin.priority.category) %>%
  dplyr::summarise(count = dplyr::n())%>%
  tidyr::drop_na()%>% ### 20200411 added clause to drop na values. 
  dplyr::mutate(per=paste0(round(100*count/total,2)))
tE <- d1 %>% 
  group_by(FCSex.priority.category) %>%
  dplyr::summarise(count = dplyr::n())%>%
  tidyr::drop_na()%>% ### 20200411 added clause to drop na values. 
  dplyr::mutate(per=paste0(round(100*count/total,2)))

# one row per score type, one column per priority category
df2 <- data.frame(matrix(nrow = 3, ncol = 5))
colnames(df2) <- c("category" ,	"HP",	"MP",	"LP",	"SC")
df2$category <- as.factor(c("FCSc-mean","FCSex","FCSin" ))
df2[1,2:4] <- perByCategory(tA, c("HP","MP","LP"))
# the combined score has no SC column in this chart
df2[1,5] <- as.numeric(0)
df2[2,2:5] <- perByCategory(tE, c("HP","MP","LP","SC"))
df2[3,2:5] <- perByCategory(tI, c("HP","MP","LP","SC"))

# y labels in the reverse order of df2's rows; the x values are reversed to match
c1 <- factor(c("FCSin","FCSex","FCSc-mean"), levels = c("FCSin","FCSex","FCSc-mean"))


pg <- plot_ly(x = rev(df2$HP), y = c1,orientation = 'h',
              type = 'bar',  name = "HP",
              marker = list(color = 'rgba(204, 44, 0, 0.6)'
              ))%>% 
  add_trace(x = rev(df2$MP), name = 'MP',
            marker = list(color = 'rgba(255, 145, 0, 0.6)'
            ))%>% 
  add_trace(x = rev(df2$LP), name = 'LP',
            marker = list(color = 'rgba(255, 234, 0, 0.6)'
            ))%>% 
  add_trace(x = rev(df2$SC), name = 'SC',
            marker = list(color = 'rgba(3, 204, 0, 0.6)'
            ))%>% 
  layout(barmode = 'stack',
         xaxis = list(title = "Proportion of taxa (%)"),
         yaxis = list(title =""))
pg
htmlwidgets::saveWidget(pg,file = "totalInsituExsitu.html")

# priority group 
# Percentage of taxa in each FCSc-mean priority category (HP/MP/LP), computed
# separately for every value of d1$Category, written to CSV and plotted as a
# stacked bar chart.
uc <- sort(unique(d1$Category))
priorityGroup <- data.frame(matrix(ncol = 4, nrow = length(uc)))
colnames(priorityGroup) <- c("category","HP", "MP","LP")
for(i in seq_along(uc)){
  p <- d1 %>%
    dplyr::filter(Category == uc[i])%>%
    dplyr::group_by(`FCSc.mean.priority.category`)%>%
    dplyr::summarise(count = dplyr::n())%>%
    tidyr::drop_na()%>% ### 20200411 added clause to drop na values. 
    dplyr::mutate(countT= sum(count)) %>%
    dplyr::mutate(per=paste0(round(100*count/countT,2)))
  
  # percentages keyed by category name
  per <- setNames(as.numeric(p$per), as.character(p$FCSc.mean.priority.category))
  priorityGroup$category[i] <- as.character(paste0(uc[i], "  "))
  # a category with no taxa in this group is reported as 0 rather than
  # stopping the loop with a missing-column error
  for(lvl in c("HP","MP","LP")){
    priorityGroup[i, lvl] <- if(lvl %in% names(per)) per[[lvl]] else 0
  }
}
write.csv(x = priorityGroup, file = "priorityGroup.csv")

# y labels are the groups in reverse order, matching the rev() applied to x
uc1 <- rev(as.character(uc))
c1 <- factor(uc1, levels = uc1)

pg <- plot_ly(x = rev(priorityGroup$HP), y = c1,
              type = 'bar',  name = "HP",
              marker = list(color = 'rgba(204, 44, 0, 0.6)'
              ))%>% 
  add_trace(x = rev(priorityGroup$MP), name = 'MP',
            marker = list(color = 'rgba(255, 145, 0, 0.6)'
            ))%>% 
  add_trace(x = rev(priorityGroup$LP), name = 'LP',
            marker = list(color = 'rgba(255, 234, 0, 0.6)'
            ))%>%
  layout(barmode = 'stack',
         xaxis = list(title = "Proportion of taxa (%)"),
         yaxis = list(title =""))
pg 
htmlwidgets::saveWidget(pg,file = "priorityGroup.html")

# associated crop type 
# Percentage of taxa in each FCSc-mean priority category (HP/MP/LP) for every
# value of d1$Associated.crop.type.specific, written to CSV and plotted as a
# stacked bar chart.
uc <- sort(unique(d1$Associated.crop.type.specific))
priorityGroup <- data.frame(matrix(ncol = 4, nrow = length(uc)))
colnames(priorityGroup) <- c("category","HP", "MP","LP")
for(i in seq_along(uc)){
  p <- d1 %>%
    dplyr::filter(Associated.crop.type.specific == uc[i])%>%
    dplyr::group_by(`FCSc.mean.priority.category`)%>%
    dplyr::summarise(count = dplyr::n()) %>%
    tidyr::drop_na()%>% ### 20200411 added clause to drop na values. 
    dplyr::mutate(countT= sum(count)) %>%
    dplyr::mutate(per=paste0(round(100*count/countT,2)))
  # percentages keyed by category name; this replaces the spread/select step
  # and its two near-identical branches
  per <- setNames(as.numeric(p$per), as.character(p$FCSc.mean.priority.category))
  priorityGroup$category[i] <- as.character(paste0(uc[i], "  "))
  # a category with no taxa in this group is reported as 0
  for(lvl in c("HP","MP","LP")){
    priorityGroup[i, lvl] <- if(lvl %in% names(per)) per[[lvl]] else 0
  }
}
write.csv(x = priorityGroup, file = "associateCropType.csv")


# y labels are the groups in reverse order, matching the rev() applied to x
uc1 <- rev(as.character(uc))


c1 <- factor(uc1, levels = uc1)

pg <- plot_ly(x = rev(priorityGroup$HP), y = c1,
              type = 'bar',  name = "HP",
              marker = list(color = 'rgba(204, 44, 0, 0.6)'
              ))%>% 
  add_trace(x = rev(priorityGroup$MP), name = 'MP',
            marker = list(color = 'rgba(255, 145, 0, 0.6)'
            ))%>% 
  add_trace(x = rev(priorityGroup$LP), name = 'LP',
            marker = list(color = 'rgba(255, 234, 0, 0.6)'
            ))%>%
  layout(barmode = 'stack',
         xaxis = list(title = "Proportion of taxa (%)"),
         yaxis = list(title =""))
pg
htmlwidgets::saveWidget(pg,file = "associateCropType.html")


# associated crop 
# Percentage of taxa in each FCSc-mean priority category (HP/MP/LP) for every
# value of d1$Associated.crop.common.name, written to CSV and plotted as a
# stacked bar chart.
uc <- sort(unique(d1$Associated.crop.common.name))
priorityGroup <- data.frame(matrix(ncol = 4, nrow = length(uc)))
colnames(priorityGroup) <- c("category","HP", "MP","LP")
for(i in seq_along(uc)){
  p <- d1 %>%
    dplyr::filter(Associated.crop.common.name == uc[i])%>%
    dplyr::group_by(`FCSc.mean.priority.category`)%>%
    dplyr::summarise(count = dplyr::n()) %>%
    tidyr::drop_na()%>% ### 20200411 added clause to drop na values. 
    dplyr::mutate(countT= sum(count)) %>%
    dplyr::mutate(per=paste0(round(100*count/countT,2)))
  # percentages keyed by category name; this replaces the spread/select step
  # and its two near-identical branches
  per <- setNames(as.numeric(p$per), as.character(p$FCSc.mean.priority.category))
  priorityGroup$category[i] <- as.character(paste0(uc[i], "  "))
  # a category with no taxa in this group is reported as 0
  for(lvl in c("HP","MP","LP")){
    priorityGroup[i, lvl] <- if(lvl %in% names(per)) per[[lvl]] else 0
  }
}
write.csv(x = priorityGroup, file = "Associated.crop.common.name.csv")


# y labels are the groups in reverse order, matching the rev() applied to x
uc1 <- rev(as.character(uc))


c1 <- factor(uc1, levels = uc1)

pg <- plot_ly(x = rev(priorityGroup$HP), y = c1,
              type = 'bar',  name = "HP",
              marker = list(color = 'rgba(204, 44, 0, 0.6)'
              ))%>% 
  add_trace(x = rev(priorityGroup$MP), name = 'MP',
            marker = list(color = 'rgba(255, 145, 0, 0.6)'
            ))%>% 
  add_trace(x = rev(priorityGroup$LP), name = 'LP',
            marker = list(color = 'rgba(255, 234, 0, 0.6)'
            ))%>%
  layout(barmode = 'stack',
         xaxis = list(title = "Proportion of taxa (%)"),
         yaxis = list(title =""))

pg
htmlwidgets::saveWidget(pg,file = "Associated.crop.common.name.html")

# redlist groups 
# Percentage of taxa in each FCSc-mean priority category (HP/MP/LP) for every
# combined threat assessment status, written to CSV and plotted as a stacked
# bar chart.

# statuses listed from most to least threatened (the literal list is reversed)
uc <- rev(c("Least Concern (LC)",
        "Possible Near Threatened (NT)",
        "Vulnerable (VU)",
        "Endangered (EN)",
        "Critically Endangered (CR)"))
priorityGroup <- data.frame(matrix(ncol = 4, nrow = length(uc)))
colnames(priorityGroup) <- c("category","HP", "MP","LP")

for(i in seq_along(uc)){
  p <- d1 %>%
    dplyr::filter(Combined.threat.assessment.status == uc[i])%>%
    dplyr::group_by(`FCSc.mean.priority.category`)%>%
    dplyr::summarise(count = dplyr::n()) %>%
    # drop taxa without a priority category, as the other summaries in this
    # script do; an NA group otherwise ends up as a column after spreading and
    # is counted in the denominator
    tidyr::drop_na()%>%
    dplyr::mutate(countT= sum(count)) %>%
    dplyr::mutate(per=paste0(round(100*count/countT,2)))
  # percentages keyed by category name; this replaces the spread/select step
  # and its two near-identical branches
  per <- setNames(as.numeric(p$per), as.character(p$FCSc.mean.priority.category))
  priorityGroup$category[i] <- as.character(paste0(uc[i], "  "))
  # a category with no taxa in this group is reported as 0
  for(lvl in c("HP","MP","LP")){
    priorityGroup[i, lvl] <- if(lvl %in% names(per)) per[[lvl]] else 0
  }
}
write.csv(x = priorityGroup, file = "redlistGroups.csv")


# y labels from least to most threatened, i.e. the reverse of uc, matching the
# rev() applied to the x values below
uc1 <- c("Least Concern (LC) ",
             "Possible Near Threatened (NT) ",
             "Vulnerable (VU) ",
             "Endangered (EN) ",
             "Critically Endangered (CR) ")


c1 <- factor(uc1, levels = uc1)

pg <- plot_ly(x = rev(priorityGroup$HP), y = c1,
              type = 'bar',  name = "HP",
              marker = list(color = 'rgba(204, 44, 0, 0.6)'
              ))%>% 
  add_trace(x = rev(priorityGroup$MP), name = 'MP',
            marker = list(color = 'rgba(255, 145, 0, 0.6)'
            ))%>% 
  add_trace(x = rev(priorityGroup$LP), name = 'LP',
            marker = list(color = 'rgba(255, 234, 0, 0.6)'
            ))%>%
  layout(barmode = 'stack',
         xaxis = list(title = "Proportion of taxa (%)"),
         yaxis = list(title =""))

pg 
htmlwidgets::saveWidget(pg,file = "redlistGroups.html")


# ### older function did not work due to the ordering of factors issue 
# #function for ploting
# plot2<- function(df){
#   fig <- plot_ly(data = df, x = ~HP, y = ~category,
#                  type = 'bar', orientation = 'h', name = "HP", 
#                  marker = list(color = 'rgba(204, 44, 0, 0.6)'
#                  ))%>% 
#     add_trace(x = ~MP, name = 'MP',
#               marker = list(color = 'rgba(255, 145, 0, 0.6)'
#               ))%>% 
#     add_trace(x = ~LP, name = 'LP',
#               marker = list(color = 'rgba(255, 234, 0, 0.6)'
#               ))%>% 
#     layout(barmode = 'stack',
#            xaxis = list(title = "Proportion of taxa (%)"),
#            yaxis = list(title =""))
#   return(fig)
# }
D:/cwrNA/src/test/combineAllAndModeledData.R 

###
# attempt to join current allData with previous modeled data. 
# 20200401
# dan.carver@carverd.com
###
library(data.table)
library(dplyr)

## read in 
# full occurrence table; V1 is the record key used by the review sheets below
ad<-data.table::fread("D:/cwrNA/occurrence_data2019_05_29/combinedOccurance2020-03-18.csv",
                      header = TRUE)

## read in sheets review by colin 
# each sheet carries an `action` column; rows marked "Remove" are dropped
gw <- read.csv("D:/cwrNA/occurrence_data2019_05_29/troubleshootingDuplicates/duplicatedWiews-Genesys_ckforDan.csv")

ar <- read.csv("D:/cwrNA/occurrence_data2019_05_29/troubleshootingDuplicates/allGsFrom2020-03-18Data_ckforDantoremove.csv")

### filter out values from ad based on the key value. 
gw1 <- gw %>% 
  dplyr::filter(action == "Remove")
dim(gw1)
#filter the full dataset 
ad1 <- ad[!ad$V1 %in% unique(gw1$V1),]

#repeat for second file 
ar1 <- ar %>%
  dplyr::filter(action == "Remove")
dim(ar1)
# filter full dataset 
ad2 <- ad1[!ad1$V1 %in% unique(ar1$V1),]
#drop v1 column 
ad2<- ad2[,-1]
names(ad2)
#write out the full occurrence dataset 
write.csv(x = ad2, file = "D:/cwrNA/occurrence_data2019_05_29/combinedOccurance2020-04-03.csv")

# write out the G dataset 
ad3 <- ad2[ad2$type == "G",]
dim(ad3)
write.csv(x = ad3, file = "D:/cwrNA/occurrence_data2019_05_29/gOccurance2020-04-03.csv")

# number of G records removed by the two review sheets
allG <- ad[ad$type == "G", ]
nrow(allG)-nrow(ad3)

### generate a specieslist to rerun 
# every taxon that appears in either review sheet (not only the rows marked
# "Remove"); the empty placeholder data frame that used to be built here was
# overwritten immediately and has been dropped
spl <- gw %>% dplyr::select(taxon)
spl
spl2 <- ar %>% dplyr::select(taxon)
spl3 <- rbind(spl, spl2) %>%
  dplyr::distinct()
write.csv(x = spl3, file = "D:/cwrNA/occurrence_data2019_05_29/troubleshootingDuplicates/speciesToReRunSRSex.csv")


### nothing down here should need to be repeated. 
### work to detect points of issue 

# filter for G points and flag repeated unique IDs / institution codes
gs <- ad %>%
  dplyr::filter(type == "G")
gs$dID <- duplicated(gs$uniqueID)
gs$dIC <- duplicated(gs$institutionCode)
# NOTE(review): dID and dIC are computed independently, so a row is kept here
# when its uniqueID was seen before AND its institutionCode was seen before,
# not necessarily together on the same earlier row - confirm this is intended.
g1 <- gs %>% dplyr::filter(dID == TRUE & dIC == TRUE)

# occurrence with duplicated unique id and institution code.
gDup <- g1 %>%
  dplyr::group_by(taxon) %>%
  dplyr::summarise(count = n())

# per taxon test for two duplicates, restricted to the Genesys and wiews sources
sp <- unique(g1$taxon)
perTaxon <- lapply(sp, function(i) {
  # %in% is required here: `== c("Genesys", "wiews")` recycles the two values
  # along the rows and silently drops roughly half of the matching records.
  q1 <- gs %>%
    dplyr::filter(taxon == i, databaseSource %in% c("Genesys", "wiews")) %>%
    dplyr::arrange(databaseSource)
  q1$dID <- duplicated(q1$uniqueID)
  q1$dIC <- duplicated(q1$institutionCode)
  q1$remove <- q1$dID == TRUE & q1$dIC == TRUE
  q1$latLong <- !is.na(q1$latitude) & !is.na(q1$longitude)
  print(i)
  q1
})
if (length(perTaxon) > 0) {
  # bind once instead of growing the table inside the loop
  nW <- dplyr::bind_rows(perTaxon)
} else {
  # same empty placeholder the script used when no taxon has duplicates
  nW <- data.frame(matrix(nrow = 0, ncol = 21))
}
write.csv(x = nW, file = "D:/cwrNA/occurrence_data2019_05_29/duplicatedWiews-Genesys.csv")

# summarize what is to be removed: number of flagged rows per taxon, split by
# whether the record has both coordinates (latLong).
s1 <- nW %>%
  dplyr::filter(remove == TRUE)%>%
  dplyr::group_by(taxon,latLong)%>%
  dplyr::summarise(totals = n())
write.csv(x = s1, file = "D:/cwrNA/occurrence_data2019_05_29/valuesToRemoveWithLatLong.csv")


# Share of duplicated records per taxon.
# NOTE(review): `gCount` is not created anywhere in this script; it is
# presumably a per-taxon table with a `total` column left in the workspace by
# an earlier session - confirm, otherwise this join fails.
g2 <- left_join(gDup, gCount, by = "taxon")
g2$percent <- (g2$count / g2$total)*100
# interactive inspection only (opens the data viewer)
View(g2)
write.csv(x = g2, file = "D:/cwrNA/occurrence_data2019_05_29/duplicatedUniqueIDWithPercents.csv")

# export every G record together with its dID / dIC flags
write.csv(x = gs, file = "D:/cwrNA/occurrence_data2019_05_29/allGsFrom2020-03-18Data.csv")

# work on a means of identifying what occurrence data was used in the modeling process
# summarize at the genus level

# define run
run <- "test20200203"
base <- "D:/cwrNA"

# keys shared by rawData.csv and rawDataForNA.csv, used to carry attributes over
joinKeys <- c("taxon", "latitude", "longitude", "type", "databaseSource")
# columns written to compiledNADataWithAttributes.csv
naAttributeCols <- c("taxon", "latitude", "longitude", "type", "databaseSource",
                     "genus", "species", "institutionCode", "uniqueID",
                     "sampleCategory", "country", "iso3", "localityInformation",
                     "biologicalStatus", "collectionSource", "finalOriginStat")

# all genera
genera <- unique(ad$genus)
for (i in seq_along(genera)) {
  g <- genera[i]
  # plain character vector of taxa; `[, 1]` on a data.table returns a
  # one-column table, which made the species loop run once over the whole column
  sp <- unique(ad$taxon[which(ad$genus == g)])

  # zero-row placeholders so a header-only CSV is still written for an empty genus
  dfRaw <- data.frame(matrix(nrow = 0, ncol = 17))
  colnames(dfRaw) <- c("V1", "taxon", "genus", "species", "latitude", "longitude",
                       "databaseSource", "institutionCode", "type", "uniqueID",
                       "sampleCategory", "country", "iso3", "localityInformation",
                       "biologicalStatus", "collectionSource", "finalOriginStat")
  dfNA <- data.frame(matrix(nrow = 0, ncol = 8))
  colnames(dfNA) <- c("taxon", "latitude", "longitude", "type", "databaseSource",
                      "hasLat", "hasLong", "hasLatLong")
  # width now matches the number of names (was 21 columns for 16 names)
  dfNAWithID <- data.frame(matrix(nrow = 0, ncol = length(naAttributeCols)))
  colnames(dfNAWithID) <- naAttributeCols

  for (s in sp) {
    print(s)
    occDir <- paste0(base, "/gap_analysis/", g, "/", s, "/", run, "/occurrences")

    # pull in raw data
    path <- paste0(occDir, "/rawData.csv")
    if (file.exists(path)) {
      t1 <- read.csv(path)
      t1$latitude <- as.numeric(as.character(t1$latitude))
      t1$longitude <- as.numeric(as.character(t1$longitude))
      dfRaw <- rbind(dfRaw, t1)
    }

    # pull in data in NA
    path <- paste0(occDir, "/rawDataForNA.csv")
    if (!file.exists(path)) {
      # nothing to join for this species; previously the join below ran on the
      # `t2` left over from an earlier species (or failed when none existed)
      next
    }
    t2 <- read.csv(path)
    dfNA <- rbind(dfNA, t2)

    # join to connect all data to NA data
    d2 <- t2 %>% dplyr::mutate(ID = row_number())
    if (nrow(dfRaw) > 0) {
      dfAll <- dplyr::left_join(d2, dfRaw, by = joinKeys)
    } else {
      # no raw data read yet for this genus: keep the NA records and leave the
      # attribute columns empty instead of joining against the placeholder
      dfAll <- d2
      for (col in setdiff(naAttributeCols, names(dfAll))) {
        dfAll[[col]] <- rep(NA, nrow(dfAll))
      }
    }
    # drop values based on repeat ID (the join can fan out on repeated keys)
    dfAll <- dfAll[!duplicated(dfAll$ID), naAttributeCols]
    dfNAWithID <- rbind(dfNAWithID, dfAll)
  }
  write.csv(x = dfRaw, file = paste0(base, "/gap_analysis/", g, "/compiledRawData.csv"))
  write.csv(x = dfNA, file = paste0(base, "/gap_analysis/", g, "/compiledNorthAmericanData.csv"))
  write.csv(x = dfNAWithID, file = paste0(base, "/gap_analysis/", g, "/compiledNADataWithAttributes.csv"))
}


#### 20200402 
# we are going to work with older occurrence datasets 

# modeling data from the run summaries
md <- data.table::fread(
  "D:/cwrNA/runSummaries/allspeciesOccurrenceData2020-03-26.csv"
)
# current all-occurrence data; coordinates coerced to numeric for the join
ad <- data.table::fread(
  "D:/cwrNA/occurrence_data2019_05_29/combinedOccurance2020-04-01.csv",
  header = TRUE
)
ad[["latitude"]] <- as.numeric(ad[["latitude"]])
ad[["longitude"]] <- as.numeric(ad[["longitude"]])
# copy of the source column under a name that cannot clash in the join
ad[["dbSourceTemp"]] <- ad[["databaseSource"]]

# attach the allData attributes to each modeled record
dd <- dplyr::left_join(md, ad, by = c("taxon", "latitude", "longitude"))
# repeated coordinates fan the join out; keep the first match per modeled row
di <- dd[!duplicated(dd$V1.x), ]

# pick the output columns, renaming type.x -> type and
# dbSourceTemp -> databaseSource in the same step
d1 <- dplyr::select(
  di,
  taxon,
  latitude, longitude, type = type.x, databaseSource = dbSourceTemp,
  hasLat, hasLong, hasLatLong, iso3_check,
  StateTest, genus, species, institutionCode,
  uniqueID, sampleCategory, country, iso3,
  localityInformation, biologicalStatus,
  collectionSource, finalOriginStat
)
write.csv(x = d1, file = "D:/cwrNA/runSummaries/modelDataWithSoruce2020-04-01.csv")
D:/cwrNA/src/test/deletingFiles.R 

### 
# a script that removes all older model runs from the computer
# 20200108 
# dan.carver@carverd.com
### 

# background 


# WARNING: this section permanently deletes folders and files via unlink().
baseDir <- "D:/cwrUSA/gap_analysis"

# every directory below the gap analysis root
oldFiles <- list.dirs(path = baseDir, full.names = TRUE, recursive = TRUE)

occData <- data.table::fread("D:/cwrNA/occurrence_data2019_05_29/combinedOccurance2020-04-07.csv",
                             header = TRUE)
# drop the first (row index) column; the upper bound must be the number of
# columns - the previous nrow() bound asked for columns that do not exist
occData <- occData[, 2:ncol(occData)]
speciesList <- unique(occData$taxon)

# run folders of the 20191023 test run
old2 <- oldFiles[grep(pattern = "test20191023$", x = oldFiles)]
# filter files by the species list
# presumably the layout is <baseDir>/<genus>/<species>/<run>, so the species is
# the parent folder name of each run folder - verify against the directory tree
old3 <- old2[basename(dirname(old2)) %in% speciesList]

n <- list.files(old2[1], recursive = TRUE)

# NOTE(review): removes only the sixth matching run folder; looks like an
# ad hoc manual test - confirm before rerunning
unlink(x = old2[6], recursive = TRUE)


allFiles <- list.files(path = "D:/cwrNA/gap_analysis/Capsicum", full.names = TRUE, recursive = TRUE)
# regex alternation belongs inside one pattern string; `"a" | "b"` is a
# logical OR on character values and stops with an error
oldFiles <- allFiles[grep(pattern = "2019-11-06|2019-11-05", x = allFiles)]
unlink(x = oldFiles)


# Recursively delete every folder under `directory` whose full path matches
# the regular expression `pattern`.
#
# directory: root folder to search (the root itself is part of the search).
# pattern:   regular expression matched against each folder's full path.
# Returns (invisibly) the confirmation message on success; when unlink()
# reports a failure a warning is raised instead of announcing success.
# WARNING: deletion is permanent.
deleteALot <- function(directory,pattern){
  allFolders <- list.dirs(path = directory, full.names = TRUE, recursive = TRUE)
  oldFolders <- allFolders[grepl(pattern = pattern, x = allFolders)]
  # unlink() returns 0 on success and 1 on failure
  status <- unlink(x = oldFolders, recursive = TRUE)
  if (status != 0) {
    warning("Some folders matching ", pattern, " could not be removed from ",
            directory, call. = FALSE)
    return(invisible(NULL))
  }
  print(paste0("All files and folders containing ", pattern, " are gone forever."))
}


# run-folder name patterns (regular expressions) to purge
listOfPatterns <- "test20200131$"

# purge each pattern in turn from the USA gap analysis tree
for (i in listOfPatterns) {
  deleteALot("D:/cwrUSA/gap_analysis", i)
}
D:/cwrNA/src/test/generateAllMetricData.R 

###
# generating all summary data metrics 
# 20200408 
# dan.carver@carverd.com 
### 


# CWR inventory: taxon names from the current inventory sheet
d1 <- read.csv(
  "D:/cwrNA/parameters/USA_cropWildRelativeInventory/CWRofUSA_Inventory_2020_02_26.csv",
  header = TRUE
)
fL <- dplyr::select(d1, Taxon_GRIN.Global_2019.final)

# taxon names from the original project species list
CWRuslist <- read.csv("D:/cwrNA/speciesList/CWRoftheUSA_synonyms20191114.csv")
tL <- dplyr::select(CWRuslist, Taxon_GRIN.Global_2019.final)

# taxa present on both lists, one row each
fullList <- dplyr::distinct(
  dplyr::inner_join(x = fL, y = tL, by = "Taxon_GRIN.Global_2019.final")
)

# genera list; `occData` is expected to exist in the workspace already
genera <- sort(unique(occData$genus))
# model run folder name
run_v <- "test20200203"
## counts, gap analysis scores, and redlist scores are pulled from that run folder

# one-column frame of sorted species names that all results are joined onto
df1 <- data.frame(matrix(nrow = nrow(fullList), ncol = 1))
names(df1) <- "species"
df1[["species"]] <- as.character(sort(fullList[["Taxon_GRIN.Global_2019.final"]]))
# loop over all species and collect the per-species result files

# Classify final conservation scores into priority categories:
# [0,25) HP, [25,50) MP, [50,75) LP, 75 and above SC. NA scores stay NA.
fcsPriorityClass <- function(fcs) {
  as.character(cut(fcs,
                   breaks = c(-Inf, 25, 50, 75, Inf),
                   labels = c("HP", "MP", "LP", "SC"),
                   right = FALSE))
}

# Read a CSV when it exists, otherwise return NULL.
readIfPresent <- function(path) {
  if (file.exists(path)) read.csv(path) else NULL
}

# Results are gathered in lists and bound once after the loop. The previous
# `if (n == 1)` initialisation failed with "object not found" whenever the
# very first species was missing one of the files.
ctList <- list()
gEList <- list()
gIList <- list()
gFList <- list()
rlList <- list()

for(i in genera){
  # species of this genus; anchored on "<genus> " so a genus name cannot match
  # inside another taxon name
  spList2 <- df1$species[startsWith(df1$species, paste0(i, " "))]
  for(j in spList2){
    sp_dir <- paste0("D:/cwrNA/gap_analysis/",i,"/",j,"/",run_v)

    # occurrence counts
    ct <- readIfPresent(paste0(sp_dir, "/counts.csv"))
    if (!is.null(ct)) {
      ctList[[length(ctList) + 1]] <- ct
    }

    # ex situ gap analysis summary, with its priority class
    gE <- readIfPresent(paste0(sp_dir, "/gap_analysis/exsitu/summary.csv"))
    if (!is.null(gE)) {
      gE$Exsitu_Score <- fcsPriorityClass(gE$FCS)
      gEList[[length(gEList) + 1]] <- gE
    }

    # in situ gap analysis summary, with its priority class
    gI <- readIfPresent(paste0(sp_dir, "/gap_analysis/insitu/summary.csv"))
    if (!is.null(gI)) {
      gI <- gI %>%
        dplyr::select("ID", "SRS.NTOTAL", "SRS.ProTotal", "SRS.SRS", "GRS", "ERS", "FCS")
      gI$Insitu_Score <- fcsPriorityClass(gI$FCS)
      gIList[[length(gIList) + 1]] <- gI
    }

    # combined final conservation score
    gF <- readIfPresent(paste0(sp_dir, "/gap_analysis/combined/fcs_combined.csv"))
    if (!is.null(gF)) {
      gFList[[length(gFList) + 1]] <- gF
    }

    # red list values
    rL <- readIfPresent(paste0(sp_dir, "/gap_analysis/redList/listingValues4kmClean.csv"))
    if (!is.null(rL)) {
      rlList[[length(rlList) + 1]] <- rL
    }
  }
  print(paste0(i, " have been compiled"))
}

ctAll <- dplyr::bind_rows(ctList)
gEAll <- dplyr::bind_rows(gEList)
gIAll <- dplyr::bind_rows(gIList)
gFAll <- dplyr::bind_rows(gFList)
rlAll <- dplyr::bind_rows(rlList)

# generate the combined score for redList Values
# Threat categories ordered from least (1) to most (5) severe; the position in
# this vector is the numeric value used to compare the AOO and EOO status.
statusRank <- c("Least Concern (LC)",
                "Possible Near Threatened (NT)",
                "Vulnerable (VU)",
                "Endangered (EN)",
                "Critically Endangered (CR)")
# as.character() so the labels (not factor codes) are compared and stored
aooStatus <- as.character(rlAll$AOO.Status)
eooStatus <- as.character(rlAll$EOO.Status)
rlAll$aVal <- match(aooStatus, statusRank)
rlAll$eVal <- match(eooStatus, statusRank)

# a status outside the five known labels used to stop the loop with an
# `if (NA)` error; it now yields NA for that row and is reported here
unknownStatus <- setdiff(c(aooStatus, eooStatus), c(statusRank, NA))
if (length(unknownStatus) > 0) {
  warning("Unrecognised red list status: ",
          paste(unknownStatus, collapse = "; "), call. = FALSE)
}

# the combined status is the more severe of the two; EOO wins ties
rlAll$`Combined Status` <- ifelse(rlAll$eVal >= rlAll$aVal, eooStatus, aooStatus)

rlAll <- rlAll %>% dplyr::select(c("taxon","EOO.Area.km2","EOO.Status","AOO",
                                   "AOO.adjusted.Minimum","AOO.Status","Combined Status"))
# Chain full joins from the complete species list so that species without
# results still appear in the summary.
ctFull <- dplyr::full_join(df1, ctAll, by = "species")

gEFull <- dplyr::full_join(ctFull, gEAll, by = c("species" = "ID"))

gIFull <- dplyr::full_join(gEFull, gIAll, by = c("species" = "ID"))

gFFull <- dplyr::full_join(gIFull, gFAll, by = c("species" = "ID"))

allSummary <- dplyr::full_join(gFFull, rlAll, by = c("species" = "taxon"))


# species listed here are flagged as NOT included in the higher level analysis
noSS <- c(
  "Phaseolus acutifolius", "Phaseolus leptostachyus", "Elymus elymoides",
  "Leymus mollis", "Phaseolus maculatus", "Hordeum jubatum",
  "Helianthus petiolaris", "Ribes sanguineum", "Phaseolus polystachios",
  "Prunus serotina", "Elymus trachycaulus", "Hordeum brachyantherum",
  "Ribes roezlii", "Rubus hispidus", "Ribes hudsonianum",
  "Helianthus nuttallii", "Helianthus pauciflorus", "Humulus lupulus",
  "Allium geyeri", "Ribes oxyacanthoides", "Fragaria x ananassa",
  "Helianthus occidentalis", "Fragaria virginiana", "Elymus lanceolatus",
  "Fragaria vesca", "Helianthus niveus", "Helianthus praecox",
  "Prunus fasciculata", "Ribes malvaceum", "Rubus arcticus",
  "Vitis rotundifolia", "Fragaria chiloensis", "Ribes aureum",
  "Acer saccharum", "Allium victorialis", "Elymus stebbinsii",
  "Helianthus debilis", "Ipomoea ternifolia", "Lactuca tatarica",
  "Prunus ilicifolia", "Prunus pumila", "Ribes californicum",
  "Rubus idaeus", "Saccharum brevibarbe", "Vitis aestivalis",
  "Vitis cinerea", "Zizania aquatica", "Zizania palustris",
  "Allium schoenoprasum", "Elymus glabriflorus",
  "Elymus glaucus", "Ipomoea cordatotriloba", "Juglans major",
  "Juglans microcarpa", "Leymus salina", "Prunus virginiana",
  "Ribes cereum", "Rubus ursinus", "Tripsacum dactyloides",
  "Vaccinium crassifolium", "Vaccinium erythrocarpum", "Vaccinium ovalifolium"
)
allSummary$`Included in Summaries` <- !(allSummary$species %in% noSS)


# change column names
allSummary <- allSummary %>% dplyr::select(-Exsitu_Score)


View(allSummary)
# drop NA row
# NOTE(review): assumes the first row is the all-NA row produced upstream -
# confirm in the viewer. `-1` also behaves with fewer than two rows, where
# `2:nrow()` would count backwards and return the wrong rows.
allSummary <- allSummary[-1, , drop = FALSE]

# interactive sanity checks: differences between successive record counts
tna <- allSummary$totalUseful - allSummary$NA_occurrences
View(tna)
naMa <- allSummary$NA_occurrences - allSummary$SRS.NTOTAL
View(naMa)

# display names, assigned by position to the joined columns
newCols <- c("Species",
             "Total Records", "Records with latitude", "Records with longitude",
             "Records with coordinates",
             " Total G records", 'Total G records with coordinates',
             "Total H Records", "Total H with coordinates",
             "Number of unique data sources",
             "Total occurrences in North America", "Total G occurrences in North America",
             "Total H occurrences in North America",
             "SRSex", "GRSex", "ERSex", "FCSex", "Exsitu Conservation Score",
             "Total occurrences in modeled area",
             "Total occcurrens in modeled area in protected areas",
             "SRSin", "GRSin", "ERSin", "FCSin", "Insitu Conservation Score",
             "FCSex_value", "FCSin_value",
             "FCSc_min", "FCSc_max", "FCSc mean",
             "FCSc_min priority category", "FCSc_max priority category",
             "FCSc mean priority category",
             "EOO area km2", "EOO status", "AOO", "AOO adjusted minimum", "AOO status",
             "Combined status", "Included in Summaries"
)

# colnames<- silently pads with NA when given too few names, so a change in
# the joined columns would otherwise mislabel the table without any error
if (length(newCols) != ncol(allSummary)) {
  stop("allSummary has ", ncol(allSummary), " columns but ", length(newCols),
       " display names were supplied", call. = FALSE)
}
colnames(allSummary) <- newCols
View(allSummary)
write.csv(x = allSummary, file = paste0("D:/cwrNA/runSummaries/allMetricData", Sys.Date(), ".csv"))


# :) run from here


# Bring in the CWR inventory columns (priority level, crop type, genus, crop)
# needed for further summaries.
cwrIn <- dplyr::select(
  d1,
  "Taxon_GRIN.Global_2019.final", "Crop.or.WUS.use_general",
  "Priority.2019", "Crop.or.WUS.use_1", "Genus", "Associated_crop_common.name"
)

# character copy of the taxon name, used as the join key
cwrIn[["name"]] <- as.character(cwrIn[["Taxon_GRIN.Global_2019.final"]])
useGroup <- dplyr::left_join(allSummary, cwrIn, by = c("Species" = "name"))
View(useGroup)

### median model data, added to double check true model runs for each species
allM <- read.csv("D:/cwrNA/runSummaries/median_summary_test20200203.csv")
dt2 <- dplyr::left_join(useGroup, allM, by = c("Species" = "species"))

### table used for the figures: columns picked by position, then relabelled
dFig <- dt2[, c(1,
                43, 42, 44:46,
                14:18,
                21:33,
                35, 38,
                39)]
colnames(dFig) <- c(
  "Taxon",
  "Category", "Associated crop type general", "Associated crop type specific",
  "Genus", "Associated crop",
  "SRSex", "GRSex", "ERSex", "FCSex", "FCSex priority category",
  "SRSin", "GRSin", "ERSin", "FCSin", "FCSin priority category",
  "FCSex value", "FCSin value",
  "FCSc_min", "FCSc_max", "FCSc mean",
  "FCSc_min priority category", "FCSc_max priority category",
  "FCSc mean priority category",
  "EOO status", "AOO status",
  "Combined threat assessment status"
)
View(dFig)
write.csv(
  x = dFig,
  file = paste0("D:/cwrNA/runSummaries/allMetricData_ForFigures", Sys.Date(), ".csv")
)


D:/cwrNA/src/test/generatingCountrySummaryCountsForSpecies.R 

###
# generate counts of all  species and species in North america 
# 20191112
# carver.dan1@gmail.com
### 
library(tidyverse)
library(sp)
library(rgdal)
library(rgeos)
library(raster)

# project folders, assigned into the global environment
base_dir <<- "D:/cwrNA"
par_dir <<- paste0(base_dir, "/parameters")
occ_dir <<- paste0(par_dir, "/occurenceData")

# raw occurrence table and the North America polygon layer
occData <<- data.table::fread(
  paste0(base_dir, "/occurrence_data2019_05_29/combinedOccurance2019-12-13.csv"),
  header = TRUE
)
naSHP <<- readOGR(
  paste0(par_dir, "/northAmericaArea/northAmericaArea.shp"),
  verbose = FALSE
)
# drop the first 95 attribute columns of the polygon layer
naSHP@data <- dplyr::select(naSHP@data, -c(1:95))

# check for duplicates with latlong and


# flag records that carry a usable latitude / longitude ("\\N" and "" are
# treated as missing alongside NA)
d1 <- dplyr::mutate(
  occData,
  hasLat = !is.na(latitude) & latitude != "\\N" & latitude != "",
  hasLong = !is.na(longitude) & longitude != "\\N" & longitude != "",
  hasLatLong = hasLat & hasLong
)
# global record counts per taxon, split by coordinate availability
d2 <- d1 %>%
  dplyr::group_by(taxon, hasLatLong) %>%
  dplyr::summarise(count = n())
write.csv(d2, file = paste0(occ_dir, "/allDataCounts", Sys.Date(), ".csv"))

# filter to the general area of the USA to drop points before intersect
# The "\\N" / "" checks above suggest the coordinate columns are read as text,
# in which case `latitude > 10` compares strings. Convert to numbers first and
# filter on those. suppressWarnings() only hides the expected "NAs introduced
# by coercion" for the non-numeric placeholders.
latNum <- suppressWarnings(as.numeric(d1$latitude))
lonNum <- suppressWarnings(as.numeric(d1$longitude))
inBox <- d1$hasLatLong & !is.na(latNum) & !is.na(lonNum) &
  latNum > 10 & lonNum < -50

# attribute rows and coordinates come from the same row mask, so they always
# line up one to one (previously they were filtered separately)
d1a <- d1[inBox, ]
coord <- data.frame(longitude = lonNum[inBox], latitude = latNum[inBox])
c1 <- coord
d3 <- d1a

# stop here instead of replacing spPoint with a message string, which only
# made the crs() assignment below fail with an unrelated error
if (nrow(d3) == 0) {
  stop("there are no coodinate pairs for this species", call. = FALSE)
}

spPoint <- sp::SpatialPointsDataFrame(coords = c1, data = d3)

#proj4string = CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))
# mask to North America
raster::crs(spPoint) <- raster::crs(naSHP)


# explicit namespace: tidyverse also attaches an intersect()
intersect1 <- raster::intersect(spPoint, naSHP)
write.csv(intersect1@data,file=paste0(occ_dir, "/allNorthAmericaOccuenceData", Sys.Date(),".csv"))


# records per taxon inside North America
d2a <- intersect1@data %>%
  dplyr::group_by(taxon) %>%
  dplyr::summarise(count = n())

write.csv(d2a,file=paste0(occ_dir, "/allNorthAmericaCounts", Sys.Date(),".csv"))

D:/cwrNA/src/test/occurrenceDataByGenus.R 

###
# compile genus level data for all species based on the run data from 202002 model runs 
# dan.carver@carverd.com
# 20200414
###
library(tidyverse)
#simple option 
# use existing occurrence database 

# full occurrence table; the first (row index) column is dropped
d1 <- data.table::fread("D:/cwrNA/occurrence_data2019_05_29/combinedOccurance2020-04-07.csv",
                             header = TRUE)
d1 <- d1[,2:ncol(d1)]
genera <- sort(unique(d1$genus))

# replace source for midwest herbarium
# Relabel in place. Splitting the table on `==` / `!=` and binding the halves
# loses every record whose databaseSource is NA (data.table treats NA in a
# logical row filter as FALSE) and also reorders the rows.
# The trailing space in "midwestHerbarium " is part of the stored value.
isMidwest <- d1$databaseSource %in% "midwestHerbarium "
d1$databaseSource[isMidwest] <- "Consortium of Midwest Herbaria (2019)"


# output folder for the per-genus occurrence files
o1 <- "D:/cwrNA/occurrence_data2019_05_29/genusOccurrences"

# loop over genera: write one occurrence CSV per genus and return the
# per-taxon record counts for that genus
countsByGenus <- lapply(genera, function(i) {
  d2 <- d1[d1$genus == i,]
  write.csv(x = d2, file = paste0(o1, "/",i,Sys.Date(),".csv"))
  d2 %>%
    dplyr::group_by(taxon) %>%
    dplyr::summarise(count = n())
})
# bind once rather than growing the table inside the loop; the empty frame
# keeps the taxon / count columns present even when there are no genera
c1 <- dplyr::bind_rows(c(
  list(data.frame(taxon = character(0), count = integer(0),
                  stringsAsFactors = FALSE)),
  countsByGenus
))

# pull the data summary document and join to check species total occurrences
d4 <- read.csv("D:/cwrNA/runSummaries/allMetricData2020-04-12forFigures.csv")
# NOTE(review): columns picked by position; the join and mutate below need
# `Taxon` and `Total.Records` to be among them - confirm against the CSV
d4 <- d4[,c(1,8,11)]

j1 <- dplyr::left_join(c1, d4, by=c("taxon"="Taxon")) %>%
  dplyr::mutate(differnce = count - Total.Records)
View(j1)
write.csv(x = j1, file = paste0(o1, "/summaryCounts",Sys.Date(),".csv"))
D:/cwrNA/src/test/priority_score_summary_graph_2020.R 

# H. Achicanoy
# CIAT, 2019

library(tidyverse)
library(grid)

root <- 'D:/ToBackup/others/cwr_us'
df   <- readxl::read_excel(paste0(root,'/allMetricData2020-03-20_forfigsonly.xlsx'), sheet = 1)
cats <- c('Category','Associated crop type specific','Associated crop')

# Draw the priority-score chart for one grouping column and save it as a PNG.
#
# data:            table with columns `Taxon`, `FCSc mean` and `category_col`.
# category_col:    name of the column whose values form the chart rows.
# out_file:        path of the PNG to write (10 in wide, 320 dpi).
# band_xmin/xmax:  extent of the coloured HP/MP/LP/SC bands along the category axis.
# label_x:         position of the HP/MP/LP/SC labels along the category axis.
# height:          image height in inches.
# category_levels: row order; defaults to the categories sorted in decreasing order.
# Returns the ggplot object, so a top-level call still displays the chart.
#
# Changes from the three copied plot chains this replaces:
#  * `group = x` referred to an object that does not exist; the grouping now
#    uses the same factor as the x aesthetic.
#  * `+ ggsave(...)` is rejected by ggplot2 >= 3.3.4; the plot is saved with
#    an explicit `plot =` argument instead.
plot_priority_scores <- function(data, category_col, out_file,
                                 band_xmin, band_xmax, label_x, height,
                                 category_levels = NULL) {
  tbl <- data[, c("Taxon", category_col, "FCSc mean")]
  colnames(tbl)[2] <- "Category"
  if (is.null(category_levels)) {
    category_levels <- sort(unique(as.character(tbl$Category)), decreasing = TRUE)
  }
  # one large red marker per category at the median score
  medians <- tbl %>%
    dplyr::group_by(Category) %>%
    dplyr::summarise(`FCSc mean` = median(`FCSc mean`))

  p <- ggplot2::ggplot(tbl, ggplot2::aes(x = factor(Category, levels = category_levels),
                                         y = `FCSc mean`,
                                         group = factor(Category, levels = category_levels))) +
    ggplot2::theme_bw() +
    ggplot2::geom_blank() +
    ggplot2::coord_flip() +
    ggplot2::ylim(0, 100) +
    ggplot2::annotate("rect", ymin=0, ymax=25, xmin=band_xmin, xmax=band_xmax, alpha=.25, fill="red") +
    ggplot2::annotate("rect", ymin=25, ymax=50, xmin=band_xmin, xmax=band_xmax, alpha=.5, fill="orange") +
    ggplot2::annotate("rect", ymin=50, ymax=75, xmin=band_xmin, xmax=band_xmax, alpha=.4, fill="yellow") +
    ggplot2::annotate("rect", ymin=75, ymax=100, xmin=band_xmin, xmax=band_xmax, alpha=.3, fill="forestgreen") +
    ggplot2::geom_point() +
    ggplot2::geom_point(data = medians, size = 5, colour = 'red') +
    ggplot2::ylab(label="Final conservation score (FCSc mean)") + ggplot2::xlab("") +
    ggplot2::scale_y_continuous(breaks = seq(0, 100, 10)) +
    ggplot2::theme(panel.grid.major.x = ggplot2::element_blank(),
                   panel.grid.minor.x = ggplot2::element_blank(),
                   panel.grid.major.y = ggplot2::element_line(colour="grey60", linetype="dashed"),
                   axis.text.x  = ggplot2::element_text(size=15),
                   axis.text.y  = ggplot2::element_text(face="italic",size=15),
                   axis.title.x = ggplot2::element_text(face="bold",size=15),
                   axis.title.y = ggplot2::element_text(face="bold",size=15)) +
    ggplot2::annotate("text",
                      y        = c(12.5, 37.5, 62.5, 87.5),
                      x        = label_x,
                      label    = c("HP","MP","LP","SC"),
                      colour   = "black",
                      size     = 4,
                      fontface = 2)
  ggplot2::ggsave(out_file, plot = p, device = "png", units = "in",
                  width = 10, height = height, dpi = 320)
  p
}

# Category graph
plot_priority_scores(df, "Category",
                     out_file = paste0(root,"/category.png"),
                     band_xmin = 0.5, band_xmax = 3.5, label_x = 3.4, height = 8,
                     category_levels = c("(1)C","(1)B","(1)A"))

# Associated crop type specific
plot_priority_scores(df, "Associated crop type specific",
                     out_file = paste0(root,"/associated_crop_type_specific.png"),
                     band_xmin = 0.5, band_xmax = 14, label_x = 13.6, height = 8)

# Associated crop
plot_priority_scores(df, "Associated crop",
                     out_file = paste0(root,"/associated_crop.png"),
                     band_xmin = 0, band_xmax = 53, label_x = 52, height = 12)
D:/cwrNA/src/test/priority_score2020.R 

# H. Achicanoy
# CIAT, 2019

library(tidyverse)
library(grid)
library(readxl)

root <- 'D:/cwrNA/parameters/priorityFigure'
df   <- read.csv("D:/cwrNA/runSummaries/allMetricData2020-04-12forFigures.csv")
cats <- c('Category','Associated.crop.type.specific','Associated.crop.common.name')
dim(df)
# keep only taxa flagged for the summary metrics; %in% avoids the all-NA rows
# that `== "Y"` returns when the flag itself is NA
df <- df[df$Included.in.summary.metrics %in% "Y",]
dim(df)

# Draw the jittered priority-score chart for one grouping column and save it
# as a PNG.
#
# data:            table with columns `Taxon`, `FCSc.mean` and `category_col`.
# category_col:    name of the column whose values form the chart rows.
# out_file:        path of the PNG to write (10 in wide, 320 dpi).
# band_xmin/xmax:  extent of the coloured HP/MP/LP/SC bands along the category axis.
# label_x:         position of the HP/MP/LP/SC labels along the category axis.
# jitter_width:    `width` passed to geom_jitter().
# height:          image height in inches.
# category_levels: row order; defaults to the categories sorted in decreasing order.
# Returns the ggplot object, so a top-level call still displays the chart.
#
# `+ ggsave(...)`, used by the three copied plot chains this replaces, is
# rejected by ggplot2 >= 3.3.4; the plot is saved with an explicit `plot =`
# argument instead.
plot_priority_scores <- function(data, category_col, out_file,
                                 band_xmin, band_xmax, label_x,
                                 jitter_width, height,
                                 category_levels = NULL) {
  tbl <- data[, c("Taxon", category_col, "FCSc.mean")]
  colnames(tbl)[2] <- "Category"
  if (is.null(category_levels)) {
    category_levels <- sort(unique(as.character(tbl$Category)), decreasing = TRUE)
  }
  # one large red marker per category at the mean score (missing scores ignored)
  means <- tbl %>%
    dplyr::group_by(Category) %>%
    dplyr::summarise(`FCSc.mean` = mean(`FCSc.mean`, na.rm=TRUE))

  p <- ggplot2::ggplot(tbl, ggplot2::aes(x = factor(Category, levels = category_levels),
                                         y = `FCSc.mean`,
                                         group = factor(Category, levels = category_levels))) +
    ggplot2::theme_bw() +
    ggplot2::geom_blank() +
    ggplot2::coord_flip() +
    ggplot2::ylim(0, 100) +
    ggplot2::annotate("rect", ymin=0, ymax=25, xmin=band_xmin, xmax=band_xmax, alpha=.25, fill="red") +
    ggplot2::annotate("rect", ymin=25, ymax=50, xmin=band_xmin, xmax=band_xmax, alpha=.5, fill="orange") +
    ggplot2::annotate("rect", ymin=50, ymax=75, xmin=band_xmin, xmax=band_xmax, alpha=.4, fill="yellow") +
    ggplot2::annotate("rect", ymin=75, ymax=100, xmin=band_xmin, xmax=band_xmax, alpha=.3, fill="forestgreen") +
    ggplot2::geom_jitter(width = jitter_width) +
    ggplot2::geom_point(data = means, size = 5, colour = 'red') +
    ggplot2::ylab(label="Final conservation score (FCSc-mean)") + ggplot2::xlab("") +
    ggplot2::scale_y_continuous(breaks = seq(0, 100, 10)) +
    ggplot2::theme(panel.grid.major.x = ggplot2::element_blank(),
                   panel.grid.minor.x = ggplot2::element_blank(),
                   panel.grid.major.y = ggplot2::element_line(colour="grey60", linetype="dashed"),
                   axis.text.x  = ggplot2::element_text(size=15),
                   axis.text.y  = ggplot2::element_text(face="plain",size=15),
                   axis.title.x = ggplot2::element_text(face="bold",size=15),
                   axis.title.y = ggplot2::element_text(face="bold",size=15)) +
    ggplot2::annotate("text",
                      y        = c(12.5, 37.5, 62.5, 87.5),
                      x        = label_x,
                      label    = c("HP","MP","LP","SC"),
                      colour   = "black",
                      size     = 4,
                      fontface = 2)
  ggplot2::ggsave(out_file, plot = p, device = "png", units = "in",
                  width = 10, height = height, dpi = 320)
  p
}

# Category graph
plot_priority_scores(df, "Category",
                     out_file = paste0(root,"/category.png"),
                     band_xmin = 0.5, band_xmax = 3.5, label_x = 3.4,
                     jitter_width = 0.30, height = 6,
                     category_levels = c("(1)C","(1)B","(1)A"))

# Associated.crop.type.specific
plot_priority_scores(df, "Associated.crop.type.specific",
                     out_file = paste0(root,"/associated_crop_type_specific.png"),
                     band_xmin = 0.5, band_xmax = 14, label_x = 13.6,
                     jitter_width = 0.25, height = 6)

# Associated.crop.common.name
plot_priority_scores(df, "Associated.crop.common.name",
                     out_file = paste0(root,"/associated_crop.png"),
                     band_xmin = 0, band_xmax = 53, label_x = 52,
                     jitter_width = 0.25, height = 12)


D:/cwrNA/src/test/select_categories_indicator.R 

#María Victoria Díaz
#CIAT,2018

# This function takes a list of species uses and calculates the proportion of species with those uses in each category (HP, MP, LP, SC).
# The output is returned as a data frame.
# @param (string) usess: vector of the uses assigned to the species
# @param (string) opt: which field(s) to calculate the indicator for (min, max, mean)
# @return (dataframe): a data frame with the requested indicator for the list of uses provided.

#base_dir = "//dapadfs" 
#source('D:/Repositorios/aichi13/src/config.R')
#source('D:/Repositorios/aichi13/src/3_indicator/indicator.R')
# Run the project configuration with directory setup enabled (config() is defined elsewhere).
config(dirs = TRUE)

#usess<-"Animal_Food"
# Compute the conservation indicator for the species tagged with one or more
# uses and save it under <root>/indicator/uses/<today>/ind_<use label>.csv.
#
# @param usess (character) one or more use names looked up in the USE.1 ... USE.7
#   columns of <par_dir>/uses/uses.csv.
# @param opt (character) passed unchanged to calc_indicator().
# @return the result of calc_indicator() for the matching species that have a
#   fcs_combined.csv file, or NA when no such species exists. The same object
#   is written to the CSV file.
#
# Reads the session globals par_dir, gap_dir, run_version and root, and calls
# calc_indicator(); none of them are defined in this file section.
indicator_cat <- function(usess, opt=c("min","max","mean","ex","in")){

  # Global assignment kept from the original: the uses table stays available to
  # the calling session as `uses_sp`.
  uses_sp <<- read.csv(paste0(par_dir,"/uses/uses.csv"), sep=",", header=TRUE)

  # A species matches when any of its USE.* columns contains one of the
  # requested uses. Only columns that actually exist are inspected, so a table
  # with fewer than seven USE columns still works.
  use_cols <- intersect(paste0("USE.", 1:7), colnames(uses_sp))
  has_use <- Reduce(`|`,
                    lapply(uses_sp[use_cols], function(col) col %in% usess),
                    rep(FALSE, nrow(uses_sp)))
  spp_list <- as.character(unique(uses_sp$Taxon_key[has_use]))

  # Keep only the species whose combined gap-analysis result exists on disk.
  has_fcs <- vapply(spp_list, FUN = function(x) {
    file.exists(paste(gap_dir,"/",x,"/",run_version,"/gap_analysis/combined/fcs_combined.csv",sep=""))
  }, FUN.VALUE = logical(1))
  spp_list <- spp_list[unname(has_fcs)]

  if (length(spp_list) == 0) {
    indic_df <- NA
  } else {
    #calculate indicator for species list
    indic_df <- calc_indicator(spp_list, opt, save_file = FALSE)
  }

  # Output folder is named after today's date; create missing parents as well.
  date <- Sys.Date()
  out_dir <- paste0(root,"/indicator/uses/",date)
  if (!file.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  # Human-readable label for each known use; unknown uses keep their own name.
  use_labels <- c(Food_Additives  = "Food additives",
                  Animal_Food     = "Animal foods",
                  Bee_Plants      = "Bee plants",
                  Environmental   = "Environmental use plants",
                  Genetic_Sources = "Genetic sources",
                  Human_Food      = "Human foods",
                  Medicine        = "Medicines",
                  Pesticide       = "Pesticides",
                  Social          = "Socially relevant plants")
  use_keys <- as.character(usess)
  new_names <- ifelse(use_keys %in% names(use_labels),
                      unname(use_labels[use_keys]),
                      use_keys)
  # Several uses are joined into a single file name; with one use the name is
  # exactly what it was before.
  new_names <- paste(new_names, collapse = "_")

  write.csv(indic_df, paste(out_dir, "/ind_", new_names, ".csv", sep=""), row.names=FALSE, quote=FALSE)

  return(indic_df)
}


D:/cwrNA/src/test/select_spp_indicator.R 

##########################################   Start Functions    ###############################################
# This function takes a list of countries, selects those species from the file
# "WEP_taxonkey_distribution.csv" and then runs the calc_indicator.R function, which in turn
# calculates the proportion of species in different categories (HP, MP, LP, SC).
# The output is returned as a data.frame.
# @param (string) iso_list: vector with list of country ISOs
# @param (string) opt: which field(s) to calculate indicator for (min, max, mean)
# @return (data.frame): This function returns a data frame proportions of spp in each category,
#                       and with final indicator aggregated for the selected country

# Compute the indicator for the species of a set of countries or subregions
# and write it to a CSV file under <root>/indicator/.
#
# @param iso_list (character) country ISO2 code(s), "ALL" for every species, or
#   "NA" for Namibia when level is "country"; subregion name(s) otherwise.
# @param opt (character) passed unchanged to calc_indicator().
# @param level (character) "country" (default) selects from the WEP
#   species-by-country table; any other value selects from the UNSD subregion
#   table.
# @return the result of calc_indicator() for the selected species that have a
#   fcs_combined.csv file, or NA when there are none.
#
# Reads the session globals par_dir, gap_dir, run_version and root, and calls
# config() and calc_indicator(); none of them are defined in this file section.
select_spp_indicator <- function(iso_list="ALL", opt=c("min","max","mean","ex","in"), level = "country") {
  #load global config
  config(dirs=TRUE)

  # Keep the taxa that have a combined gap-analysis result and compute their
  # indicator; NA when none is left.
  indicator_for <- function(taxa) {
    has_fcs <- vapply(taxa, FUN = function(x) {
      file.exists(paste(gap_dir,"/",x,"/",run_version,"/gap_analysis/combined/fcs_combined.csv",sep=""))
    }, FUN.VALUE = logical(1))
    taxa <- taxa[unname(has_fcs)]
    if (length(taxa) == 0) {
      return(NA)
    }
    calc_indicator(taxa, opt, save_file = FALSE)
  }

  # Create an output folder (and any missing parent) if needed; returns the path.
  ensure_dir <- function(path) {
    if (!file.exists(path)) {
      dir.create(path, recursive = TRUE)
    }
    path
  }

  # Several codes are joined into a single file name; one code is used as is.
  iso_tag <- paste(iso_list, collapse = "_")

  if (identical(level, "country")) {
    #load list of species-by-country
    wep_list <- read.csv(paste(par_dir,"/WEP/WEP_taxonkey_distribution_ISO3.csv",sep=""),sep="\t",header=TRUE)

    iso_upper <- toupper(iso_list)
    # "ALL" is recognised case-insensitively for both the selection and the
    # output path (the original only did so for the selection).
    is_all <- identical(iso_upper, "ALL")

    #select species following given filter
    if (is_all) {
      spp_list <- unique(paste(wep_list$taxonkey))
    } else {
      wanted <- wep_list$ISO2 %in% setdiff(iso_upper, "NA")
      # Namibia is requested as "NA" but matched through its ISO3 code "NAM"
      # instead of the ISO2 column.
      if ("NA" %in% iso_upper) {
        wanted <- wanted | wep_list$ISO3 %in% "NAM"
      }
      spp_list <- unique(paste(wep_list$taxonkey[wanted]))
    }

    #filter above list of species following those that have fcs_combined.csv
    indic_df <- indicator_for(spp_list)

    # The run date is asked for interactively; an empty answer (e.g. in a
    # non-interactive session) falls back to today's date.
    date <- readline(prompt="Date of the indicator's calculation YYYY-MM-D (i.e: 2019-04-24 )")
    if (!nzchar(date)) {
      date <- as.character(Sys.Date())
    }

    country_dir <- ensure_dir(paste0(root,"/indicator/countries/",date))

    if (is_all) {
      ensure_dir(paste0(root,"/indicator/ALL"))
      out_file <- paste(root,"/indicator/ALL/indicator_ALL_",date, ".csv",sep="")
    } else {
      out_file <- paste(country_dir, "/ind_", iso_tag, ".csv", sep="")
    }
    write.csv(indic_df, out_file, row.names=FALSE, quote=FALSE)

  } else {
    wep_list <- read.csv(paste(par_dir,"/UNSD/subregions.csv",sep=""),sep=",",header=TRUE)

    #select species following given filter
    spp_list <- unique(paste(wep_list$taxonkey[wep_list$SUBREGIONS %in% iso_list]))

    #filter above list of species following those that have fcs_combined.csv
    indic_df <- indicator_for(spp_list)

    date <- Sys.Date()
    region_dir <- ensure_dir(paste0(root,"/indicator/subregions/",date))
    write.csv(indic_df, paste(region_dir, "/ind_", iso_tag, ".csv", sep=""), row.names=FALSE, quote=FALSE)
  }

  #return object
  return(indic_df)
}

##########################testing function ########################################################################################


#AS,NF ARE IN NA

##Generate Namibia indicator.##
#iso_list <- "NAM"
#indic_iso <- select_spp_indicator(iso_list, opt=c("min","max","mean","ex","in"))
#write.csv(indic_iso,paste0("//dapadfs/Workspace_cluster_9/Aichi13/indicator/countries/2018-11-16/NA","_",Sys.Date(),".csv"),row.names=F, quote=F)
D:/cwrNA/src/test/storageContent.R 

###
# storage for notes and useful one time code chunks during development 
# 20200414
# dan.carver@carverd.com
### 


# dfCounts lists some species more than once; keep the first row of each species.
dfC <- dplyr::distinct(dfCounts, species, .keep_all = TRUE)
# Save the de-duplicated counts table.
write.csv(dfC, file = paste0(base_dir, "/runSummaries/allCounts.csv"))


#20200326 
# code for generating all species map

# if(length(speciesList) >=1){
#DANGER ---- delete existing model results
# for(name in 1:length(speciesList)){
#   path <- paste0(gap_dir,"/",genus, "/", speciesList[name],"/" , run_version)
#   unlink(x = path,recursive = TRUE)
# }
# speciesList <- speciesList[!speciesList %in% rmSpec]
# for(k in speciesList){
#   files <- list.files(path = paste0(gap_dir,"/", genus,"/",k,"/",run_version, "/"),
#                       pattern = "spdist_thrsld_median.tif",
#                       recursive = TRUE, full.names = TRUE)
#   if(length(files)>0){
#     vector[n] <- k
#     n = n+1
#   }
# }
#write.csv(x = vector, file = paste0(base_dir, "/runSummaries/speciesWithMaps.csv"))
### the vector list generated here is then used to run the summary of run function once all no cumulative species are removed


#20200313 
#useful code storing here at the moment 
# for(k in speciesList){
#   files <- list.files(path = paste0(gap_dir,"/", genus,"/",k,"/",run_version, "/"),
#                       pattern = "cleanedModelingData.csv",
#                       recursive = TRUE, full.names = TRUE)
#   if(length(files) == 1){
#     t5 <- read.csv(files[1])
#     if(ncol(t5)==10){
#       df5 <- rbind(df5,t5)
#     }
#     if(ncol(t5) == 9){
#       t5$StateTest = NA
#       df5 <- rbind(df5,t5)
#     }
#     if(class(files)=="character"){
#       t5 <- data.frame(matrix(nrow = 1, ncol=10))
#       colnames(t5) <-  c(
#         "taxon","latitude", "longitude","type","databaseSource", 
#         "hasLat","hasLong","hasLatLong","iso3_check"
#       ) 
#       t5$taxon=k
#       t5$latitude=NA 
#       t5$longitude=NA
#       t5$type=NA
#       t5$databaseSource=NA
#       t5$hasLat=NA
#       t5$hasLong=NA
#       t5$hasLatLong=NA
#       t5$iso3_check =NA
#     }
#   }
# }


#### 20200210 - write out the modeled / not modeled species lists
# (the original note about moving this code elsewhere was left unfinished)

# Strip NA entries from every list element, then drop the elements left empty.
lowOccurenceAll1 <- lapply(lowOccurenceAll, function(x) x[!is.na(x)])
lowOccurenceAll1 <- lowOccurenceAll1[lengths(lowOccurenceAll1) > 0]

notModeledAll1 <- lapply(notModeledAll, function(x) x[!is.na(x)])
notModeledAll1 <- notModeledAll1[lengths(notModeledAll1) > 0]

ModeledAll1 <- lapply(ModeledAll, function(x) x[!is.na(x)])
ModeledAll1 <- ModeledAll1[lengths(ModeledAll1) > 0]

# The output table needs as many rows as the longest of the three lists.
maxLength <- max(c(length(lowOccurenceAll1),
                   length(notModeledAll1),
                   length(ModeledAll1)))

df2 <- data.frame(matrix(ncol=3, nrow= maxLength,data = NA))
colnames(df2) <- c("lowOccurrenceSpecies",
                   "speciesNotModeled",
                   "speciesSuccessfullyModeled"
)

# Pad the shorter lists with NA so that all three have maxLength entries.
lo2 <- c(lowOccurenceAll1, rep(NA, nrow(df2)-length(lowOccurenceAll1)))
nMA <- c(notModeledAll1, rep(NA, nrow(df2)-length(notModeledAll1)))
sSM <- c(ModeledAll1, rep(NA, nrow(df2)-length(ModeledAll1)))

# seq_len() instead of 1:length(): with three empty lists the original loop
# ran over c(1, 0) and failed on lo2[[1]].
# NOTE(review): each list element is written into a single cell, so an element
# holding more than one species would keep only its first value (with a
# warning) - confirm the elements are always length one.
for(i in seq_len(maxLength)){
  df2$lowOccurrenceSpecies[i] <- lo2[[i]]
  df2$speciesNotModeled[i] <- nMA[[i]]
  df2$speciesSuccessfullyModeled[i] <- sSM[[i]]
}


write.csv(x = df2, file = paste0(gap_dir, "/summaryDocs/speciesModeledAndNot", Sys.Date(),".csv"))

### Combine the full species list with the North American point counts and
### with one flag column per modeling outcome (low occurrence, not modeled,
### modeled successfully), then write the result to a dated CSV.
# list of all species names of interest
x1 <- read.csv("D:/cwrNA/parameters/statePerTaxon/CWRofUSA_nativeareas_2020_1_30.csv")
# as.data.frame() on an unnamed vector names the single column after the
# expression itself ("unique(x1$name)"); the joins below use that exact string
# as their key, so this expression must not be reformatted.
x1 <- as.data.frame(unique(x1$name))

#read in counts for all species
x2 <- read.csv("D:/cwrNA/parameters/occurenceData/allCountsSummary.csv") %>%
  dplyr::select(species, NorthAmericanPoint)
## some species appear on this list more than once; keep the first row of each
x2 <- x2[!duplicated(x2$species),]
x2 <- distinct(x2) 

# join x1 and x2 by species name 
t1 <- dplyr::left_join(x = x1, y = x2, by = c("unique(x1$name)" = "species"))
dim(t1) # printed as a row-count check (the original note here was left unfinished)

## One single-column data frame per outcome, with a constant flag column of 1.
## As above, the first column of each is named after the expression that built
## it, and those names are the join keys used further down.
l2 <- as.data.frame(df2$lowOccurrenceSpecies[!is.na(df2$lowOccurrenceSpecies)]) %>%
  dplyr::mutate(lowOcc = 1) %>%
  distinct()
l3 <- as.data.frame(df2$speciesNotModeled[!is.na(df2$speciesNotModeled)]) %>%
  dplyr::mutate(notModeled = 1)%>%
  distinct()
l4 <- as.data.frame(df2$speciesSuccessfullyModeled[!is.na(df2$speciesSuccessfullyModeled)]) %>%
  dplyr::mutate(ModeledSuccessfully = 1) %>%
  distinct()


# Attach the three flags to the species table; species without a match get NA.
t2 <- dplyr::left_join(x = t1, y = l2, by = c("unique(x1$name)" = "df2$lowOccurrenceSpecies[!is.na(df2$lowOccurrenceSpecies)]"))
t2 <- dplyr::left_join(x = t2, y = l3, by = c("unique(x1$name)" = "df2$speciesNotModeled[!is.na(df2$speciesNotModeled)]"))
t2 <- dplyr::left_join(x = t2, y = l4, by = c("unique(x1$name)" = "df2$speciesSuccessfullyModeled[!is.na(df2$speciesSuccessfullyModeled)]"))

write.csv(x = t2, file = paste0(gap_dir, "/summaryDocs/modelErrorsSummary", Sys.Date(),".csv"))


## base directory is getting redefined to gap_dir somewhere.

# Render the run summary report. This is slow; try() keeps a rendering failure
# from stopping the rest of the script.
try(
  rmarkdown::render(
    paste0(repo_dir, "/summaryMarkdown/summaryOfRun.rmd"),
    output_file = paste0("SummaryReport_", run_version, Sys.Date(), ".html"),
    output_dir = paste0(base_dir, "/runSummaries")
  )
)


#### troubleshooting content, most species list
# The three lists below are kept for reference only. Their `spList <- c(`
# headers were already commented out while the elements were not, which made
# the file unparseable; every line is now commented consistently. Entries
# marked with a second `#` were already excluded in the original.

# rerunning intraspecific species due to occurrence compiled from species level
# spList <- c(
#   "Leymus salina subsp. salina",
#   "Leymus salina subsp. salmonis",
#   "Persea palustris",
#   "Vaccinium crassifolium subsp. crassifolium",
#   "Elymus glabriflorus var. australis",
#   "Elymus glabriflorus var. glabriflorus",
#   "Elymus glaucus subsp. mackenziei",
#   "Ipomoea cordatotriloba var. cordatotriloba",
#   "Juglans major var. major",
#   "Juglans microcarpa var. microcarpa",
#   " Prunus virginiana var. demissa",
#   "Ribes cereum var. cereum",
#   "Rubus ursinus subsp. macropetalus",
#   "Rubus ursinus subsp. ursinus",
#   "Tripsacum dactyloides var. dactyloides")

# # species for issues 3, delete current folders and rerun completely
# # test with Vanilla mexicana before deleting any more runs
# spList <- c("Vanilla mexicana",
#   "Acer saccharum subsp. ozarkense",
#   "Rubus abactus",
#   "Rubus ostryifolius",
#   # "Fragaria x ananassa",
#   "Artocarpus altilis",
#   "Lactuca ludoviciana",
#   "Psidium guajava")

# Species for issue 4, try re running the models... watch how they fail. It could be there are just
# not enough points
# spList <- c("Allium bigelovii",
#   "Leymus salina subsp. mojavensis",
#   "Rubus kennedyanus",
#   "Vaccinium crassifolium subsp. crassifolium",
#   # "Fragaria chiloensis subsp. sandwicensis",
#   "Juglans jamaicensis",
#   "Helianthus verticillatus",
#   # "Ipomoea littoralis",
#   "Rubus x neglectus",
#   "Helianthus praecox subsp. praecox",
#   "Manihot walkerae",
#   "Rubus arundelanus",
#   "Allium gooddingii",
#   # "Rubus orarius",
#   "Solanum nelsonii",
#   "Vitis aestivalis var. linsecomii",
#   "Elymus stebbinsii subsp. stebbinsii",
#   "Rubus neglectus",
#   "Elymus glabriflorus var. australis",
#   "Helianthus arizonensis",
#   # "Vitis x novae-angliae",
#   "Elymus interruptus",
#   "Ipomoea dumetorum",
#   "Helianthus debilis subsp. tardiflorus",
#   "Elymus glabriflorus var. glabriflorus",
#   "Xanthosoma sagittifolium",
#   # "Juglans cinerea",
#   "Prunus andersonii","Ribes quercetorum",
#   "Gossypium hirsutum"
# )

######
# Species that had issues with red list values.
# `List` is a plain character vector of taxon names; nothing in this section
# of the file reads it.

List <- c(  "Malus fusca", 
            "Vaccinium uliginosum", 
            "Leymus mollis",
            "Vaccinium vitis-idaea", 
            "Ribes triste", 
            "Vigna luteola", 
            "Gossypium hirsutum", 
            "Vaccinium ovalifolium var. ovalifolium",
            "Xanthosoma sagittifolium", 
            "Psidium guajava", 
            "Ipomoea cordatotriloba",  
            "Ribes howellii",
            "Acer saccharum subsp. ozarkense", 
            "Oryza latifolia",
            "Ipomoea dumetorum",
            "Elymus trachycaulus",
            "Elymus glaucus",
            "Ipomoea leucantha",
            "Juglans jamaicensis",
            "Ribes aureum",
            "Ribes lacustre",
            "Vitis aestivalis var. linsecomii",
            "Allium cernuum",
            "Allium textile",
            "Leymus x multiflorus",
            "Leymus salina subsp. salina",
            "Hordeum jubatum",
            "Prunus emarginata",
            "Leymus cinereus",
            "Ribes cereum",
            "Nicotiana obtusifolia",
            "Leymus ambiguus",
            "Leymus simplex",  
            "Carya illinoinensis", 
            "Leymus innovatus", 
            "Leymus pacificus", 
            # NOTE(review): possibly a misspelling of "Leymus californicus" - confirm against the occurrence data
            "Lemus californicus", 
            "Vaccinium scoparium",  
            "Rubus ursinus",  
            "Rubus spectabilis",
            "Rubus orarius",  
            "Rubus nutkanus", 
            "Rubus hispidus var. obovalis", 
            "Rubus chamaemorus",  
            "Vaccinium crassifolium subsp. sempervirens", 
            "Solanum xanti", 
            "Daucus pusillus",  
            "Fragaria vesca", 
            "Dasiphora fruticosa", 
            "Elymus canadensis", 
            "Elymus glabriflorus", 
            "Allium gooddingii", 
            "Elymus glaucus subsp. glaucus",  
            "Solanum douglasii",  
            "Leymus salina", 
            "Helianthus annuus", 
            # NOTE(review): possibly a misspelling of "Prunus virginiana var. demissa" - confirm against the occurrence data
            "Prunus virginiana var. demsa",
            "Hordeum brachyantherum", 
            "Elymus elymoides",
            "Pseudoroegneria spicata",
            "Juglans microcarpa",
            "Allium bigelovii", 
            "Vaccinium erythrocarpum subsp. erythrocarpum",
            "Fragaria x ananassa", 
            "Fragaria virginiana", 
            "Juglans major",  
            "Leymus mollis subsp. villosissimus",
            "Tripsacum dactyloides", 
            "Allium schoenoprasum subsp. schoenoprasum",
            "Allium schoenoprasum", 
            "Fragaria x bringhurstii", 
            "Phaseolus leptostachyus var. leptostachyus", 
            "Psidium longipes",  "Rubus abactus", 
            "Saccharum brevibarbe var. brevibarbe", 
            "Vaccinium crassifolium", 
            "Vaccinium erythrocarpum",  
            "Vaccinium ovalifolium",  
            "Vanilla mexicana"
            
)

D:/cwrNA/src/test/testingDataFromDifferentSources.R 


# Load three dated snapshots of the combined occurrence table.
march <- data.table::fread(
  "D:/cwrNA/occurrence_data2019_05_29/combinedOccurance2020-03-18.csv",
  header = TRUE
)
dim(march)
april <- data.table::fread(
  "D:/cwrNA/occurrence_data2019_05_29/combinedOccurance2020-04-07.csv"
)
# Keep every column of the April snapshot except the first one.
april <- april[,2:ncol(april)]

jan <- data.table::fread(
  "D:/cwrNA/occurrence_data2019_05_29/combinedOccurance2020-01-11.csv",
  header = TRUE
)

# Pull the records of one taxon from each snapshot and open them in the viewer.
lM <- dplyr::filter(march, taxon == "Pseudoroegneria spicata")
View(lM)
lA <- dplyr::filter(april, taxon == "Leymus californicus")
View(lA)

lJ <- dplyr::filter(jan, taxon == "Pseudoroegneria spicata")
View(lJ)
D:/cwrNA/src/test/troubleshootingCounts.R 

###
# generating all summary data metrics 
# 20200408 
# dan.carver@carverd.com 
### 


# Taxon names from the CWR inventory.
d1 <- read.csv("D:/cwrNA/parameters/USA_cropWildRelativeInventory/CWRofUSA_Inventory_2020_02_26.csv", header = TRUE)
fL <- dplyr::select(d1, Taxon_GRIN.Global_2019.final)

# Taxon names from the original project species list.
CWRuslist <- read.csv("D:/cwrNA/speciesList/CWRoftheUSA_synonyms20191114.csv")
tL <- dplyr::select(CWRuslist, Taxon_GRIN.Global_2019.final)

# Taxa present on both lists, one row each.
fullList <- dplyr::distinct(
  dplyr::inner_join(x = fL, y = tL, by = "Taxon_GRIN.Global_2019.final")
)

# Sorted genus names found in the occurrence data.
genera <- sort(unique(occData$genus))
# Name of the model run folder to read results from.
run_v <- "test20200203"
## pull counts, gap analysis scores, and redlist scores from the model run folder

# One-column table of the sorted species names; the per-species results are
# joined onto it later.
df1 <- data.frame(
  species = as.character(sort(fullList$Taxon_GRIN.Global_2019.final)),
  stringsAsFactors = FALSE
)
# Loop over every genus and species, read whichever result files exist in the
# run folder and stack them into one table per result type:
#   ctAll (counts), gEAll (ex situ), gIAll (in situ), gFAll (combined FCS),
#   rlAll (red list values).

# Priority class of a final conservation score:
# HP for < 25, MP for [25, 50), LP for [50, 75), SC otherwise. NA stays NA.
fcsClass <- function(fcs) {
  as.character(cut(fcs,
                   breaks = c(-Inf, 25, 50, 75, Inf),
                   labels = c("HP", "MP", "LP", "SC"),
                   right = FALSE))
}

# Results are collected in lists and bound once at the end. The original only
# created each table while its counter was 1, so a missing file for the very
# first species left the table undefined and every later append failed.
ctList <- list()
gEList <- list()
gIList <- list()
gFList <- list()
rlList <- list()

for(i in genera){
  # species names containing the genus name
  # NOTE(review): the genus is matched as an unanchored regular expression
  # anywhere in the name - confirm that cannot pick up another genus.
  spList2 <- df1[grep(pattern = i,x = df1$species),]
  for(j in spList2){
    sp_dir <- paste0("D:/cwrNA/gap_analysis/",i,"/",j,"/",run_v)

    # occurrence counts
    if(file.exists(paste0(sp_dir, "/counts.csv"))){
      ctList[[length(ctList) + 1]] <- read.csv(paste0(sp_dir, "/counts.csv"))
    }

    # ex situ summary, with its priority class
    if(file.exists(paste0(sp_dir, "/gap_analysis/exsitu/summary.csv"))){
      gE <- read.csv(paste0(sp_dir, "/gap_analysis/exsitu/summary.csv"))
      gE$Exsitu_Score <- fcsClass(gE$FCS)
      gEList[[length(gEList) + 1]] <- gE
    }

    # in situ summary (selected columns), with its priority class
    if(file.exists(paste0(sp_dir, "/gap_analysis/insitu/summary.csv"))){
      gI <- read.csv(paste0(sp_dir, "/gap_analysis/insitu/summary.csv")) %>%
        dplyr::select("ID","SRS.NTOTAL","SRS.ProTotal","SRS.SRS","GRS","ERS","FCS")
      gI$Insitu_Score <- fcsClass(gI$FCS)
      gIList[[length(gIList) + 1]] <- gI
    }

    # combined final conservation score
    if(file.exists(paste0(sp_dir, "/gap_analysis/combined/fcs_combined.csv"))){
      gFList[[length(gFList) + 1]] <- read.csv(paste0(sp_dir, "/gap_analysis/combined/fcs_combined.csv"))
    }

    # red list values
    if(file.exists(paste0(sp_dir, "/gap_analysis/redList/listingValues4kmClean.csv"))){
      rlList[[length(rlList) + 1]] <- read.csv(paste0(sp_dir, "/gap_analysis/redList/listingValues4kmClean.csv"))
    }
  }
  print(paste0(i, " have been compiled"))
}

ctAll <- dplyr::bind_rows(ctList)
gEAll <- dplyr::bind_rows(gEList)
gIAll <- dplyr::bind_rows(gIList)
gFAll <- dplyr::bind_rows(gFList)
rlAll <- dplyr::bind_rows(rlList)

# Generate the combined red list status: the more severe of the EOO and AOO
# statuses, with the EOO status winning ties.

# Severity rank of each status label (higher = more threatened).
statusRank <- c("Least Concern (LC)"            = 1,
                "Possible Near Threatened (NT)" = 2,
                "Vulnerable (VU)"               = 3,
                "Endangered (EN)"               = 4,
                "Critically Endangered (CR)"    = 5)

# as.character() guards against the status columns having been read as factors.
aooStatus <- as.character(rlAll$AOO.Status)
eooStatus <- as.character(rlAll$EOO.Status)

# Vectorised lookup: a missing or unrecognised status gets rank NA. The
# original row-by-row loop stopped with an error on such rows, and also failed
# when rlAll had no rows.
rlAll$aVal <- unname(statusRank[aooStatus])
rlAll$eVal <- unname(statusRank[eooStatus])

# Rows where either rank is NA get an NA combined status.
rlAll$`Combined Status` <- ifelse(rlAll$eVal >= rlAll$aVal, eooStatus, aooStatus)

# Keep the reporting columns only; the helper rank columns are dropped here.
rlAll <- rlAll %>% dplyr::select(c("taxon","EOO.Area.km2","EOO.Status","AOO",
                                    "AOO.adjusted.Minimum","AOO.Status","Combined Status"))
# Full joins keyed on the complete species list, so species without any result
# still appear (with NA values) and can be identified.
ctFull <- df1 %>% dplyr::full_join(ctAll, by = "species")

gEFull <- ctFull %>% dplyr::full_join(gEAll, by = c("species" = "ID"))

gIFull <- gEFull %>% dplyr::full_join(gIAll, by = c("species" = "ID"))

gFFull <- gIFull %>% dplyr::full_join(gFAll, by = c("species" = "ID"))

allSummary <- gFFull %>% dplyr::full_join(rlAll, by = c("species" = "taxon"))


# Species listed here are flagged FALSE in the `Included in Summaries` column;
# every other species is flagged TRUE.
noSS <- c(
  "Phaseolus acutifolius", "Phaseolus leptostachyus", "Elymus elymoides",
  "Leymus mollis", "Phaseolus maculatus", "Hordeum jubatum",
  "Helianthus petiolaris", "Ribes sanguineum", "Phaseolus polystachios",
  "Prunus serotina", "Elymus trachycaulus", "Hordeum brachyantherum",
  "Ribes roezlii", "Rubus hispidus", "Ribes hudsonianum",
  "Helianthus nuttallii", "Helianthus pauciflorus", "Humulus lupulus",
  "Allium geyeri", "Ribes oxyacanthoides", "Fragaria x ananassa",
  "Helianthus occidentalis", "Fragaria virginiana", "Elymus lanceolatus",
  "Fragaria vesca", "Helianthus niveus", "Helianthus praecox",
  "Prunus fasciculata", "Ribes malvaceum", "Rubus arcticus",
  "Vitis rotundifolia", "Fragaria chiloensis", "Ribes aureum",
  "Acer saccharum", "Allium victorialis", "Elymus stebbinsii",
  "Helianthus debilis", "Ipomoea ternifolia", "Lactuca tatarica",
  "Prunus ilicifolia", "Prunus pumila", "Ribes californicum",
  "Rubus idaeus", "Saccharum brevibarbe", "Vitis aestivalis",
  "Vitis cinerea", "Zizania aquatica", "Zizania palustris",
  "Allium schoenoprasum", "Elymus glabriflorus", "Elymus glaucus",
  "Ipomoea cordatotriloba", "Juglans major", "Juglans microcarpa",
  "Leymus salina", "Prunus virginiana", "Ribes cereum",
  "Rubus ursinus", "Tripsacum dactyloides", "Vaccinium crassifolium",
  "Vaccinium erythrocarpum", "Vaccinium ovalifolium"
)
allSummary[["Included in Summaries"]] <- !is.element(allSummary$species, noSS)


# Remove the Exsitu_Score column.
allSummary <- dplyr::select(allSummary, -Exsitu_Score)


View(allSummary)
# Drop the first row.
# NOTE(review): this assumes row 1 is the all-NA row - confirm after the joins.
allSummary <- allSummary[2:nrow(allSummary),]

# Report-ready column names, in the column order of allSummary. The assignment
# below is purely positional, so the order and the count both have to match.
# NOTE(review): " Total G records" starts with a space and "occcurrens" looks
# like a typo; both are left untouched because they end up as headers of the
# written CSV - confirm no consumer relies on them before correcting.
newCols <- c(
  # occurrence counts
  "Species",
  "Total Records",
  "Records with latitude",
  "Records with longitude",
  "Records with coordinates",
  " Total G records",
  "Total G records with coordinates",
  "Total H Records",
  "Total H with coordinates",
  "Number of unique data sources",
  "Total occurrences in North America",
  "Total G occurrences in North America",
  "Total H occurrences in North America",
  # ex situ scores
  "SRSex", "GRSex", "ERSex", "FCSex",
  "Exsitu Conservation Score",
  # in situ scores
  "Total occurrences in modeled area",
  "Total occcurrens in modeled area in protected areas",
  "SRSin", "GRSin", "ERSin", "FCSin",
  "Insitu Conservation Score",
  # combined scores
  "FCSex_value", "FCSin_value",
  "FCSc_min", "FCSc_max", "FCSc mean",
  "FCSc_min priority category",
  "FCSc_max priority category",
  "FCSc mean priority category",
  # red list values
  "EOO area km2", "EOO status",
  "AOO", "AOO adjusted minimum", "AOO status",
  "Combined status",
  "Included in Summaries"
)

names(allSummary) <- newCols

write.csv(allSummary, file = paste0("D:/cwrNA/runSummaries/allMetricData", Sys.Date(), ".csv"))


# :) run from here

# Import the CWR inventory and join the priority level and the crop type for further summaries.
cwrIn <- d1 %>% dplyr::select("Taxon_GRIN.Global_2019.final",
                              "Crop.or.WUS.use_general", "Priority.2019",
                              "Crop.or.WUS.use_1",
                              "Genus",
                              "Associated_crop_common.name")
colnames(cwrIn) <- c(
  "name",
  "Associated crop type general",
  "Category",
  "Associated crop type specific",
  "Genus",
  "Associated crop"
)


cwrIn$name <- as.character(cwrIn$name)
useGroup <- dplyr::left_join(x = allSummary ,y= cwrIn, by= c("Species" = "name"))
View(useGroup)
# drop NA columns
#useGroup <- useGroup[useGroup$species != "",]
#write.csv(x = useGroup, file = paste0("D:/cwrNA/runSummaries/allMetricData", Sys.Date(), ".csv"))

### adding the median model data to this data to double check true model runs for each species
allM <- read.csv("D:/cwrNA/runSummaries/median_summary_test20200203.csv")
dt2 <- dplyr::left_join(useGroup, allM, by = c("Species" = "species"))


### generate content for figures
# Source columns for the figure table, listed in the order of the output names
# assigned below. The original picked them with the positional index
# c(1,42,43,44,45,14,...), which skipped column 41 ("Associated crop type
# general") and included column 36 ("AOO"), so most of the 27 output names
# landed on the wrong column. Selecting by name keeps data and labels aligned.
figSource <- c("Species", "Category",
               "Associated crop type general", "Associated crop type specific",
               "Genus", "Associated crop",
               "SRSex", "GRSex", "ERSex", "FCSex", "Exsitu Conservation Score",
               "SRSin", "GRSin", "ERSin", "FCSin", "Insitu Conservation Score",
               "FCSex_value", "FCSin_value",
               "FCSc_min", "FCSc_max", "FCSc mean",
               "FCSc_min priority category", "FCSc_max priority category",
               "FCSc mean priority category",
               "EOO status", "AOO status", "Combined status")
# Positions are looked up in useGroup: the left join keeps its columns first
# and in the same order in dt2, even if a name clash with allM adds suffixes.
figIdx <- match(figSource, colnames(useGroup))
# Stop loudly rather than write a mislabelled table if a column is missing.
stopifnot(!anyNA(figIdx))
dFig <- dt2[, figIdx]
colnames(dFig) <- c("Taxon",	"Category",	"Associated crop type general",	"Associated crop type specific",
"Genus",	"Associated crop",	"SRSex",	"GRSex",	"ERSex",	"FCSex",	"FCSex priority category",
"SRSin",	"GRSin",	"ERSin",	"FCSin",	"FCSin priority category",
"FCSex value",	"FCSin value",
"FCSc_min",	"FCSc_max",	"FCSc mean",	"FCSc_min priority category",
"FCSc_max priority category",	"FCSc mean priority category",	"EOO status",
"AOO status",	"Combined threat assessment status")
write.csv(x = dFig, file = paste0("D:/cwrNA/runSummaries/allMetricData_ForFigures", Sys.Date(), ".csv"))


write.csv(x = dt2, file = paste0("D:/cwrNA/runSummaries/allMetricData_withRuns", Sys.Date(), ".csv"))


# Combinations of the summary-inclusion flag and `Valid` among the species with
# at least 25 North American occurrences.
# The original passed the column names as quoted strings, so filter() compared
# a constant string with 25 (true for every row) and group_by() grouped on
# constants; backticks make them real column references.
# NOTE(review): `Valid` is presumably a column brought in from the median
# summary table - confirm it exists in dt2.
dt2a <- dt2 %>%
  dplyr::filter(`Total occurrences in North America` >= 25) %>%
  dplyr::group_by(`Included in Summaries`, Valid) %>%
  dplyr::summarise()
View(dt2a)
# 20200225
# pulling data for intraspecific species
# NOTE(review): the original had a dangling `spList2 <-` here (the species
# vector was evidently deleted). It chained into the next assignment and
# overwrote spList2 with the filtered data frame. The dangling assignment is
# removed; spList2 must hold the intraspecific taxon names before this runs.
iSL <- occData %>%
  dplyr::filter(taxon %in% spList2)

# number of records per taxon
a <- iSL %>%
  dplyr::group_by(taxon) %>%
  dplyr::summarise(count = dplyr::n())

sort(unique(iSL$taxon))
View(iSL)
write.csv(x = iSL, file = "D:/cwrNA/troubleshooting/intraSpecificSpecsList2.csv")

### pre 20200224


# Number of occurrence records per taxon and record type.
spCount <- dplyr::summarise(
  dplyr::group_by(occData, taxon, type),
  count = dplyr::n()
)
write.csv(spCount, file = "D:/temp/spOccurrenceCount.csv")


# Unique taxon names of the CWR species list.
d1 <- read.csv("D:/cwrNA/speciesList/CWRoftheUSA_synonyms20191114.csv", header = TRUE)
fullList <- unique(d1$Taxon_GRIN.Global_2019.final)
write.csv(fullList, file = "D:/temp/speciesInCWRlist.csv")

# Taxa on the CWR list that are not in spList2.
missingSpecies <- fullList[!(fullList %in% spList2)]

# Entries of spList2 that are not on the CWR list.
extraSpecies <- spList2[!(spList2 %in% fullList)]