# install.packages("R.utils")
# install.packages("devtools")

library(caret)
library(pscl)
library(plotROC)
library(pROC)
library(sf)
library(tidyverse)
library(knitr)
library(kableExtra)
library(tigris)
library(ggthemes)
library(rgdal)
library(raster)
library(rgeos)
library(viridis)
library(ggmap)
library(gridExtra)
library(raster)
library(extrafont)
library(na.tools)

  
  q5 <- function(variable) {
    as.factor(ntile(variable, 5))}
pal<- c("#ffa600", "#58508d")

mapTheme <- theme(plot.title =element_text(size=12),
                  plot.subtitle = element_text(size=8),
                  plot.caption = element_text(size = 6),
                  axis.line=element_blank(),
                  axis.text.x=element_blank(),
                  axis.text.y=element_blank(),
                  axis.ticks=element_blank(),
                  axis.title.x=element_blank(),
                  axis.title.y=element_blank(),
                  panel.background=element_blank(),
                  panel.border=element_blank(),
                  panel.grid.major=element_line(colour = 'transparent'),
                  panel.grid.minor=element_blank(),
                  legend.direction = "vertical", 
                  legend.position = "right",
                  plot.margin = margin(1, 1, 1, 1, 'cm'),
                  legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))
# install.packages("vembedr")

library("vembedr")

embed_url("https://youtu.be/T7d659Qu3lw")

INTRODUCTION


Link to Video Presentation
Flooding is devastating and costly for homeowners, business owners and city governments alike. With the increasing severity of the climate crisis, a First Street study estimates economic damage due to flooding will grow over the next 30 years by 61 percent, to an average estimated annual loss of $7,563 per property — for an estimated total loss of $32.3 billion 1. The motivation for this analysis is to train, test and validate a predictive model using Calgary, AB data to then use it to predict flood risk for Pittsburgh, PA.

Calgary Data

We first download a variety of data from the Calgary Open Data Portal.

Citywide Land Cover - A vector shapefile that describes permeable, impermeable, natural cover and storm water surface area.

Hydrology - A vector shapefile of bodies of water in Calgary, including streams and lakes.

Calgary DEM - Digital elevation model - a raster file of Calgary elevation values.

Flood Inundation - a raster file describing observed historical flood events in Calgary.

calgary_bound<- st_as_sf(st_read('Data/Calgary/City_Boundary.csv'), wkt = "MULTIPOLYGON", crs = 4326)%>%
  st_transform('EPSG:3401')

fishnet<- st_as_sf(st_read('Data/Calgary/fishnet1.shp'), crs = 4326)%>%
  st_transform('EPSG:3401')

flood<- st_as_sf(st_read('Data/Calgary/Flood_Raster.shp'), crs = 4326)%>%
  st_transform('EPSG:3401')

#Dependent Variable
flood_tbl<- read.csv('Data/Calgary/flood_tbl.csv')%>%
  dplyr::select(uniqueID, MAJORITY)%>%
  rename(FLOOD_INNUN = MAJORITY)

#Predictors
permeable<- read.csv('Data/Calgary/permeable.csv')%>%
  dplyr::select(uniqueID, MAJORITY)%>%
  rename(PERMEABLE =MAJORITY)
perm_sum<- read.csv('Data/Calgary/perm_sum.csv')%>%
  dplyr::select(uniqueID, SUM)%>%
  rename(PERMEABLE =SUM)
imperm_sum<- read.csv('Data/Calgary/imperm_sum.csv')%>%
  dplyr::select(uniqueID, SUM)%>%
  rename(IMPERMEABLE= SUM)
stream_sum<- read.csv('Data/Calgary/stream_sum1.csv')%>%
  dplyr::select(uniqueID, SUM)%>%
  rename(STREAM = SUM)
lake_sum<- read.csv('Data/Calgary/lake_sum.csv')%>%
  dplyr::select(uniqueID, SUM)%>%
  rename(LAKE= SUM)
natcov_sum<- read.csv('Data/Calgary/natcov_sum.csv')%>%
  dplyr::select(uniqueID, SUM)%>%
  rename(NATCOV= SUM)
elevation_cat <- read.csv('Data/Calgary/elevation_cat.csv')%>%
  dplyr::select(uniqueID, MIN)%>%
  rename(ELEVATION_CAT= MIN)
dist_slope <- read.csv('Data/Calgary/distz_to_slope.csv')%>%
  dplyr::select(uniqueID, MAX)%>%
  rename(DIST_SLOPE= MAX)
slope <- read.csv('Data/Calgary/slope.csv')%>%
  dplyr::select(uniqueID, MAX)%>%
  rename(SLOPE= MAX)
elevation <- read.csv('Data/Calgary/elev.csv')%>%
  dplyr::select(uniqueID, MAX)%>%
  rename(ELEVATION= MAX)
LC <-  read.csv('Data/Calgary/LC.csv')%>%
  dplyr::select(uniqueID, MAJORITY)%>%
  rename(LANDCOVER= MAJORITY)
watershed<-  read.csv('Data/Calgary/watershed.csv')%>%
  dplyr::select(uniqueID, MAJORITY)%>%
  rename(WATERSHED= MAJORITY)
watershed_area<-  read.csv('Data/Calgary/watershed_area.csv')%>%
  dplyr::select(VALUE, AREA)%>%
  rename(WATERSHED= VALUE)
watershed<- left_join(watershed, watershed_area, by = "WATERSHED")%>%
  mutate(WATERSHED_AREA = AREA)
#join variables together by uniqueID
vars <- full_join(flood_tbl,natcov_sum, by = "uniqueID")%>%
  full_join(., perm_sum, by= "uniqueID")%>%
  full_join(., imperm_sum, by= "uniqueID")%>%
  full_join(., stream_sum, by= "uniqueID")%>%
  full_join(., lake_sum, by = "uniqueID")%>%
  full_join(., elevation_cat, by = "uniqueID")%>%
  full_join(., dist_slope, by = "uniqueID")%>%
  full_join(., slope, by = "uniqueID")%>%
  full_join(., elevation, by = "uniqueID")%>%
  full_join(., LC, by = "uniqueID")%>%
  full_join(., watershed, by = "uniqueID")
  

#join variables to the fishnet
#drop na will remove cells that dont have observations for dependent variable

fishnet_vars<- merge(fishnet, vars, by = "uniqueID")%>%
  drop_na(FLOOD_INNUN)


#engineer some features
#continuous vs majority/minority
fishnet_vars<- 
fishnet_vars%>%
  mutate( MAJ_PER = ifelse(PERMEABLE ==1, 1, 0),
          MAJ_NAT = ifelse(NATCOV >= 50, 1, 0),
          MAJ_IMP = ifelse(IMPERMEABLE >= 50, 1, 0),
          MAJ_STR = ifelse(STREAM >= 1, 1, 0),
          MAJ_LAK = ifelse(LAKE >= 1, 1, 0))

We create a ‘fishnet’ across Calgary’s geography that is made of 200m x 200m cells. The fishnet allows us to view and analyze our data and make predictions consistently across space.The fishnet crops area within Calgary’s border because we are limited to the extent of our variables including the digital elevation model and the flood inundation observation. The maps below visualize our data once they are joined to the fishnet layer.

grid.arrange(
  
(ggplot() +
  geom_sf(data = calgary_bound,
          fill= 'white')+
  geom_sf(data=flood, 
          fill = "dark blue", 
          color = "dark blue",
          alpha = 0.6) +
  labs(title="Flood Events in Calgary") + 
   theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),

(ggplot()+
  geom_sf(data = fishnet_vars, aes(fill = as.factor(FLOOD_INNUN)), color = 'grey35')+
  scale_fill_manual(values = c('#FFFFFF', '#6993f5'),
                    labels = c("No Flood", "Flood"),
                    name = " ")+
  labs(title = 'Flood Events in Calgary | Fishnet')+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())), nrow=1)

# cal_basemap <- get_map(location=c(lon = -114.0672670749499, lat = 51.04024248460027), zoom=11, maptype = 'terrain-background', source = 'stamen')
grid.arrange (
  
( ggplot()+
  geom_sf(data = fishnet_vars%>%na.omit, 
          aes(fill = factor(WATERSHED)), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1,
                     discrete= T, 
                     name = "Watershed")+
  labs(title = "Calgary Watersheds")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank()) ),

(ggplot()+
  geom_sf(data = fishnet_vars, 
          aes(fill = PERMEABLE), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Relative Permeable Land Cover\nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),
  
(ggplot()+
  geom_sf(data = fishnet_vars, 
          aes(fill = IMPERMEABLE), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Relative Impermeable Land Cover\nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),
  
(ggplot()+
  geom_sf(data = fishnet_vars, 
          aes(fill = NATCOV), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Relative Natural Land Cover\nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),

(ggplot()+
  geom_sf(data = fishnet_vars, 
          aes(fill = ELEVATION), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Elevation Value \nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),

 (ggplot()+
  geom_sf(data = fishnet_vars, 
          aes(fill = DIST_SLOPE), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Distance to Nearest Steep Slope \nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),

 (ggplot()+
  geom_sf(data = fishnet_vars, 
          aes(fill = SLOPE), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Slope per Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),

# (ggplot()+
#   geom_sf(data = fishnet_vars, 
#           aes(fill = as.factor(WATERSHED)), 
#           color= NA)+
#   scale_fill_viridis(option = 'A',
#                      direction =1, 
#                      discrete=T)+
#   labs(title = "Slope per Fishnet Cell",
#        fill = "watershed")+
#   theme_minimal()+
#   theme(axis.line=element_blank(),
#         axis.text.x=element_blank(),
#         axis.text.y=element_blank(),
#         axis.ticks=element_blank(),
#         axis.title.x=element_blank(),
#         axis.title.y=element_blank())),

 (ggplot()+
  geom_sf(data = fishnet_vars, 
          aes(fill = as.factor(LANDCOVER)), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =-1,
                     discrete = T,
                     labels = c("PERMEABLE", "NATURAL COVER","IMPERMEABLE", "STORMPOND"))+
  labs(title = "Relative Impermeable Land Cover\nper Fishnet Cell",
fill = "landcover")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())), ncol = 2)

EXPLORATORY DATA ANALYSIS

The data was first brought into ArcGIS Pro to create new features using various geoprocessing & hydrology workflows. We use a fishnet of Calgary, with 200m cells, to assign values for each of our predictive features. the variables joined to our fishnet are as follows-

FLOOD_INUN - binary dependent variable- the observatio of a flood event. Floods = 1, Doesnt Flood = 0. LANDCOVER - The majority landcover type that occurs in each fishnet cell
ELEVATION - The average elevation value of each fishnet cell
ELEVATION_CAT - elevation value of each fishnet cell categorized by 15 intervals
SLOPE - the average slope degree of each fishnet cell
DIST_SLOPE - the distance of each fishnet cell to the nearets “steep slope” (slopes greater than 15 degrees)
PERMEABLE - percent value of permeable land cover for each fishnet cell
MAJ_PERM - binary variable. Cells > 50% = 1, cells < 50% = 0
IMPERMEABLE - percent value of impermeable land cover for each fishnet cell
MAJ_IMP - binary variable. Cells > 50% = 1, cells < 50% = 0
STREAM - percent value of stream for each fishnet cell
LAKE - percent value of lake for each fishnet cell
WATERSHED - Watershed hydrology workflow output from ArcGIS - the watershed in which each fishnet cell occurs
WATERSHED_AREA - The area (sq m) of the watershed in which the fishnet cell occurs

Next, we take a look at the relationship of each predictive variable to our dependent variable. We separate binary, categorical and continuous variables.

floodPlotVariables <- 
  fishnet_vars %>%
  as.data.frame() %>%
  dplyr::select(FLOOD_INNUN, NATCOV,  MAJ_NAT, MAJ_PER, IMPERMEABLE, MAJ_IMP, ELEVATION, ELEVATION_CAT, LANDCOVER, WATERSHED_AREA, STREAM, MAJ_STR, LAKE, MAJ_LAK, DIST_SLOPE, SLOPE) %>% mutate(ELEVATION_CAT = as.factor(ELEVATION_CAT), 
                                            LANDCOVER = as.character(LANDCOVER), 
                                            WATERSHED = as.factor(WATERSHED_AREA))%>%
  gather(key, value, NATCOV:WATERSHED)

floodPlotVariables$value <- as.numeric(floodPlotVariables$value)

#BINARY VARIABLES
floodPlotVariables%>%
  filter(key %in% c('MAJ_STR', 'MAJ_LAK', 'MAJ_NAT', 'MAJ_PER', 'MAJ_IMP'))%>%
  count(key, value, FLOOD_INNUN)%>%
ggplot(aes(as.factor(FLOOD_INNUN), n, fill= as.factor(FLOOD_INNUN))) +
      geom_bar(stat ="identity", position = "dodge") +
      geom_hline(aes(yintercept = 0), colour = 'grey15',linetype = 5, size = 1)+
     facet_wrap(~key, scales="free_y") +
     scale_fill_manual(values = pal,
                      labels = c("Does Not Flood","Floods"),
                      name = "") +
      labs(title = "Feature distributions with the Likelihood of Flooding",
           subtitle = "(BINARY)",
           x="Floods", y="Value", fill = " ") +
      theme_tufte()+theme(text = element_text(size = 10, family = "TT Arial"),
                      panel.grid.major.y = element_line(color='grey85',linetype = 'dashed'))

#CATEGORICAL
floodPlotVariables%>%
  filter(key %in% c('ELEVATION_CAT', 'LANDCOVER'))%>%
  count(key, value, FLOOD_INNUN) %>%
  ggplot(aes(value, n, fill = as.factor(FLOOD_INNUN))) + 
      geom_bar(position = "dodge2", stat = "identity")+
      facet_wrap(~key, scales="free") +
      scale_fill_manual(values = pal, 
                        labels = c("Does not Flood", "Floods"))+
      labs(title = "Feature distributions with the Likelihood of Flooding",
           subtitle = "(CATEGORICAL)",
           x="Floods", y="Value", fill = " ") +
      theme_tufte()+theme(text = element_text(size = 10, family = "TT Arial"),
                      panel.grid.major.y = element_line(color='grey85',linetype = 'dashed'))

#CONTINUOUS VARIABLES 
floodPlotVariables%>%
  filter(key %in% c('NATCOV', 'PERMEABLE', 'IMPERMEABLE', 'ELEVATION','STREAM','LAKE','DIST_SLOPE','SLOPE', 'WATERSHED'))%>%
  ggplot() + 
  geom_density(aes(value, color= as.factor(FLOOD_INNUN)), size = .75) +
  facet_wrap(~key, scales = "free")+
  scale_color_manual(values = pal, name=" ",labels =c("Doesn't Flood", "Floods"))+
  labs(title = "Feature distributions with the Likelihood of Flooding",
           subtitle = "(CONTINUOUS)", x="Floods", y="Value", color = " ") +
   theme_tufte()+
   theme(text = element_text(size = 10, family = "TT Arial"),
         panel.grid.major.y = element_line(color='grey85',linetype = 'dashed'))

BUILDING THE MODEL

#select vars 
fishnet_vars_model<- 
fishnet_vars%>%
  dplyr::select(FLOOD_INNUN, PERMEABLE, MAJ_STR, ELEVATION, DIST_SLOPE, WATERSHED_AREA, LANDCOVER, geometry)


set.seed(3456)
trainIndex <- createDataPartition(fishnet_vars_model$LANDCOVER, p = .70,
                                  list = FALSE,
                                  times = 1)

floodTrain <- fishnet_vars_model[ trainIndex,]
floodTest  <- fishnet_vars_model[-trainIndex,]
# #firts regression model
# floodModel <- glm(FLOOD_INNUN ~ ., 
#                     family="binomial"(link="logit"), data = floodTrain %>%
#                                                             as.data.frame() %>%
#                                                             dplyr::select(-geometry, -LANDCOVER))
#  summary(floodModel)

#Select Useful Variables - increase AUC
floodTrain_2<- floodTrain%>%
  dplyr::select(FLOOD_INNUN, PERMEABLE, MAJ_STR, ELEVATION, DIST_SLOPE, WATERSHED_AREA, geometry)
floodModel_2 <- glm(FLOOD_INNUN ~ ., 
                    family="binomial"(link="logit"), data = floodTrain_2%>%
                                                            as.data.frame() %>%
                                                            dplyr::select(-geometry))
summary(floodModel_2)
## 
## Call:
## glm(formula = FLOOD_INNUN ~ ., family = binomial(link = "logit"), 
##     data = floodTrain_2 %>% as.data.frame() %>% dplyr::select(-geometry))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3472  -0.2130  -0.0992  -0.0405   4.2840  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     4.017e+01  1.879e+00  21.383  < 2e-16 ***
## PERMEABLE      -1.578e-02  1.777e-03  -8.877  < 2e-16 ***
## MAJ_STR         2.727e+00  1.030e-01  26.461  < 2e-16 ***
## ELEVATION      -4.177e-02  1.831e-03 -22.809  < 2e-16 ***
## DIST_SLOPE     -6.219e-04  1.118e-04  -5.561 2.68e-08 ***
## WATERSHED_AREA  3.550e-09  3.096e-10  11.466  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5932.7  on 13121  degrees of freedom
## Residual deviance: 3183.3  on 13116  degrees of freedom
##   (26 observations deleted due to missingness)
## AIC: 3195.3
## 
## Number of Fisher Scoring iterations: 8

Model Output

After running a couple variations of our model, we choose MAJ_IMP, MAJ_STR, ELEVATION, DIST_SLOPE and WATERSHED_AREA as our significant features to predict flood inundation. The summary results of the regression model confirm that all five of these features are significant with their 0 p-value.

reg_sum<- broom::tidy(floodModel_2)%>%
  kable(align = c("l", "r", "r", "r", "r"))%>%
  kable_styling('striped')%>%
  footnote(general_title = "Feature Engineered Regression Summary",
           general = "Table 1")

reg_sum
term estimate std.error statistic p.value
(Intercept) 40.1747207 1.8788403 21.382723 0
PERMEABLE -0.0157794 0.0017775 -8.877418 0
MAJ_STR 2.7266338 0.1030420 26.461375 0
ELEVATION -0.0417749 0.0018315 -22.809378 0
DIST_SLOPE -0.0006219 0.0001118 -5.561019 0
WATERSHED_AREA 0.0000000 0.0000000 11.465907 0
Feature Engineered Regression Summary
Table 1

MODEL VALIDATION

To validate our model, we first understand the predicted probability of a fishnet cell in our test data set categorized as “flood” in our model. The histogram below tells us that the model is fantastic at predicting “0” or “not flooded” areas in Calgary. This is because the vast majority of our observations are not flood areas. The probability density plot indicates how strong our model is at predicting both flooded and not flooded fishnet cells. We again confirm that our model has strong ability to predict not flooded observations.

classProbs <- predict(floodModel_2, floodTest, type="response")
# hist(classProbs)
classProbs1 <- as.data.frame(classProbs)

ggplot()+
  geom_histogram(data = classProbs1, aes(classProbs), fill = "#ffa600")+
  labs(title = "Histogram of Class Probabilities")+
  theme_minimal()

testProbs <- data.frame(obs = as.numeric(floodTest$FLOOD_INNUN),
                        pred = classProbs)


testProbs_map <- floodTest%>%cbind(classProbs)%>%
  mutate(pred =  ifelse(classProbs > .25 ,1,0),
         obs = as.numeric(FLOOD_INNUN))%>%
  mutate(conf = case_when(pred == 1 & obs == 1 ~ "sensitivity", 
                          pred == 0 & obs == 0 ~ "specificity",
                          pred == 0 & obs == 1 ~ "NA", 
                          pred == 1 & obs == 0 ~ "NA"))


conf<-testProbs_map%>%
  filter(conf %in% c("sensitivity", "specificity"))%>%na.omit()

# st_write(conf, 'Data/Calgary/conf.shp')


ggplot(testProbs, aes(x = pred, fill=as.factor(obs))) + geom_density() +
  labs(title = "Density of Probability of Outcomes")+
  facet_grid(obs ~ .) + xlab("Probability") + geom_vline(xintercept = .25) +
  scale_fill_manual(values = pal,
                      labels = c("Not Flooded","Flooded"),
                      name = "")+theme_minimal()

Next we create a confusion matrix to understand how accurately our model is predicting across positive and negative observations. The metrics are as follows.

Predicted = 0, Observed = 0 —> True Negative
Predicted = 1, Observed = 1 —> True Positive
Predicted = 1, Observed = 0 —> False Positive
Predicted = 0, Observed = 1 —> False Negative

Sensitivity - the proportion of actual positives (1’s) that were predicted to be positive. Also known as “true positive rate”.

Specificity - The proportion of actual negatives (0’s) that were predicted to be negatives. Also known as “true negative rate”.

The quality of our model lies in how well it predicts True Positive observations (sensitivity) while minimizing the false negative output. The implication of a false negative is having areas become flooded that are not accounted for in the model, therefore not establishing appropriate protection for that area prior to the flood event. At a .25 threshold, our model predicts almost double the amount of true positives than false negatives.

testProbs$predClass  = ifelse(testProbs$pred > .25 ,1,0)

conf_matrix<- caret::confusionMatrix(reference = as.factor(testProbs$obs), 
                       data = as.factor(testProbs$predClass), 
                       positive = "1")
conf_matrix$table
##           Reference
## Prediction    0    1
##          0 5135  120
##          1  160  212
testProbs_map%>%
  filter(conf %in% c("sensitivity", "specificity"))%>%na.omit()%>%
ggplot()+
  geom_sf(aes(fill = conf), color = NA)+
  mapTheme

# conf_matrix$overall

The image below depicts the Kernel Density of sensitivity, or where our model predicts the event of flooding correctly.

ggplot(testProbs, aes(d = obs, m = pred)) + 
  geom_roc(n.cuts = 50, labels = FALSE, color = '#ffa600') + 
  style_roc(theme = theme_grey) +
  geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey')+
  theme_minimal()

auc(testProbs$obs, testProbs$pred)
## Area under the curve: 0.952

The accuracy of our model is a bit high, as indicated by the 0.952 area under the curve, and the disproportionate rates of true negative observations. This may indicate that our model is not the most generalizable over unseen data.



Cross Validation

In our cross validation, we look at how the model performs across 100 random folds in our test data set. The histogram below confirms the lack of generalizability in our model because of the density towards 1.00 accuracy.

ctrl <- trainControl(method = "cv", 
                     number = 100, 
                     savePredictions = TRUE)

fishnet_vars_model[is.na(fishnet_vars_model)] = 0

cvFit <- train(as.factor(FLOOD_INNUN) ~ .,  data = fishnet_vars_model %>% 
                                                as.data.frame() %>%
                                                dplyr::select(-geometry, - LANDCOVER), 
               method="glm", family="binomial",
               trControl = ctrl)

cvFit$results%>%kable()%>%
  kable_styling('striped')%>%
  footnote(general_title = "100 Fold Cross Validation Summary",
           general = "Table 2")
parameter Accuracy Kappa AccuracySD KappaSD
none 0.9519256 0.4920769 0.0131758 0.141488
100 Fold Cross Validation Summary
Table 2
summary(cvFit)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.8709  -0.2641  -0.1560  -0.0902   3.5084  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.494e+01  1.108e+00  13.485  < 2e-16 ***
## PERMEABLE      -1.432e-02  1.470e-03  -9.743  < 2e-16 ***
## MAJ_STR         2.695e+00  8.104e-02  33.258  < 2e-16 ***
## ELEVATION      -1.740e-02  1.053e-03 -16.526  < 2e-16 ***
## DIST_SLOPE     -6.331e-04  8.896e-05  -7.117  1.1e-12 ***
## WATERSHED_AREA  2.650e-09  2.307e-10  11.487  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8498.7  on 18781  degrees of freedom
## Residual deviance: 5270.9  on 18776  degrees of freedom
## AIC: 5282.9
## 
## Number of Fisher Scoring iterations: 7
ggplot(as.data.frame(cvFit$resample), aes(Accuracy)) + 
  geom_histogram(bins = 20, binwidth = .01, fill = '#ffa600') +
  scale_x_continuous(limits = c(.6, 1)) +
  labs(title= "Accuracy of Model Across 100 Folds",
  x="Accuracy",
       y="Count")+
  theme_minimal()

allPredictions <- 
  predict(cvFit, fishnet_vars_model%>%dplyr::select(-LANDCOVER), type="prob")[,2]

fishnet_vars_model1 <- 
  cbind(fishnet_vars_model,allPredictions) %>%
  mutate(allPredictions = round(allPredictions * 100)) 

ggplot() + 
    geom_sf(data=fishnet_vars_model1, aes(fill=factor(ntile(allPredictions,5))), colour=NA) +
    scale_fill_manual(values = c("#edf8fb","#b3cde3","#8c96c6","#8856a7","#810f7c"),
                      labels=as.character(quantile(fishnet_vars_model1$allPredictions,
                                                 c(0.1,.2,.4,.6,.8),na.rm=T)),
                      name="Predicted\nProbabilities(%)\n(Quintile Breaks)") +
  geom_sf(data=flood, 
          fill = "dark blue", 
          color = NA,
          alpha = 0.6) +
  mapTheme +
  labs(title="Calgary Flood (Predicted vs Observed)")

The map above indicates where our model predicts flooding to happen based on our input variables. When we overlay actual observed flooding occurrences, we can get a better idea of the spatial relationship of our model’s predictions.

TESTING MODEL ON PITTSBURGH

Load Pittsurgh Data

Our next step is to load in data and use the same ArcGIS workflows to create our variables for Pittsburgh, PA.

Pit_bound <-  st_read("Data/Pitts/Allegheny_County_Municipal_Boundaries.geojson", crs = 'EPSG:4326')%>%
 st_transform('EPSG:3650')%>%
  filter(NAME == "PITTSBURGH")

pit_net <- st_as_sf(st_read('Data/Pitts/Pit_net_ft.shp'), crs = 4326)%>%
  st_transform('EPSG:3650')

#Predictors for pittsburgh

pits_imperm_sum<- read.csv('Data/Pitts/imperm.csv')%>%
  dplyr::select(uniqueID, SUM)%>%
  rename(IMPERMEABLE= SUM)
pits_perm<- read.csv('Data/Pitts/pit_permeable.csv')%>%
  dplyr::select(uniqueID, SUM)%>%
  rename(PERM = SUM)
pits_stream_sum<- read.csv('Data/Pitts/river.csv')%>%
  dplyr::select(uniqueID, SUM)%>%
  rename(STREAM = SUM)
pits_dist_slope <- read.csv('Data/Pitts/dist_slope.csv')%>%
  dplyr::select(uniqueID, MEAN)%>%
  rename(DIST_SLOPE= MEAN)
pits_elevation <- read.csv('Data/Pitts/pit_elev.csv')%>%
  dplyr::select(uniqueID, MIN)%>%
  rename(ELEVATION= MIN)
# pits_LC <-  read.csv('Data/Calgary/LC.csv')%>%
#   dplyr::select(uniqueID, MAJORITY)%>%
#   rename(LANDCOVER= MAJORITY)
pits_watershed <-  read.csv('Data/Pitts/pit_watershed.csv')%>%
  dplyr::select(uniqueID, MAJORITY)%>%
  rename(WATERSHED= MAJORITY)

pits_watershed_area<-  read.csv('Data/Pitts/pit_watershed_area.csv')%>%
  dplyr::select(VALUE, AREA)%>%
  rename(WATERSHED= VALUE)
pits_watershed<- pits_watershed%>%
  left_join(., pits_watershed_area, by = "WATERSHED")%>%
  mutate(WATERSHED_AREA = AREA)

pits_perm<- pits_perm%>%
  mutate(PERMEABLE = round((PERM/125) *100))%>%dplyr::select(-PERM)

# pits_watershed%>%
#   group_by(WATERSHED_AREA) %>%
#   summarise(max = max(AREA, na.rm=TRUE), 
#             min = min(AREA, na.rm=TRUE))
#join variables together by uniqueID
pit_vars <- full_join(pits_stream_sum, pits_perm, by = "uniqueID")%>%
  full_join(., pits_dist_slope, by= "uniqueID")%>%
  full_join(., pits_elevation, by= "uniqueID")%>%
  full_join(., pits_watershed%>%dplyr::select(uniqueID, WATERSHED_AREA), by= "uniqueID")

#join variables to the fishnet
#drop na will remove cells that dont have observations for dependent variable

pit_vars<- merge(pit_net, pit_vars, by = "uniqueID")%>%
  mutate(MAJ_STR = ifelse(STREAM >= 63, 1, 0))

pit_vars$PERMEABLE[is.na(pit_vars$PERMEABLE)] <- 0

grid.arrange(
  (ggplot()+
geom_sf(data = pit_vars, 
          aes(fill = ELEVATION), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Elevation Value \nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),


(ggplot()+
geom_sf(data = pit_vars, 
          aes(fill = STREAM), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Majority Stream \nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),
  

(ggplot()+
geom_sf(data = pit_vars, 
          aes(fill = DIST_SLOPE), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Distance to Nearest Steep Slope \nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),
  

(ggplot()+
geom_sf(data = pit_vars%>% na.omit, 
          aes(fill = PERMEABLE), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1)+
  labs(title = "Relative Permeable Land Cover\nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),

(ggplot()+
geom_sf(data = pit_vars%>% na.omit, 
          aes(fill = as.factor(WATERSHED_AREA)), 
          color= NA)+
  scale_fill_viridis(option = 'A',
                     direction =1,
                     discrete = T)+
  guides(fill = FALSE)+
  labs(title = "Watershed Area \nper Fishnet Cell")+
  theme_minimal()+
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())),ncol = 2, heights = c(8,8,8,8,8))

CONCLUSION

The of our model on Pittsburgh is a map that looks similar to our Calgary test predictions. Pittsburgh’s overall area is only about 53 sq miles while Calgary’s is over 300. Because we used the same cell size for our fishnets for both cities, the resolution of our pittsburgh predictions will be lower and more pixellated. This predictive model can now be used towards informed and proactive decision making for flood mitigation.

pit_vars<- pit_vars%>%dplyr::select(-STREAM)
pit_Predictions <- 
  predict(cvFit, pit_vars, type="prob")[,2]


pit_vars <- 
  cbind(pit_Predictions, pit_vars) %>%
  mutate(pit_Predictions = round(pit_Predictions * 100))


ggplot() + 
    geom_sf(data=pit_vars, aes(fill=factor(ntile(pit_Predictions,5)), 
                               geometry = geometry), colour=NA) +
    scale_fill_manual(values = c("#edf8fb","#b3cde3","#8c96c6","#8856a7","#810f7c"),
                      labels= c(1,2,3,4,5),
                      name="Predicted\nProbabilities(%)\n(Quintile Breaks)") +
  mapTheme +
  labs(title="Pittsburgh Flood Prediction")

pit_vars<-
  pit_vars%>%
  mutate(pit_flood = factor(ntile(pit_Predictions, 5)))

pit_flood<- pit_vars%>%
  filter(pit_flood == 5)


pit_flood2<- pit_vars%>%
  filter(pit_flood == 4)

ggplot()+
  geom_sf(data = pit_flood, fill = 'blue', color = NA)+
  geom_sf(data= Pit_bound, fill = NA, color = 'grey35', size = 1)+
  mapTheme

# st_write(pit_flood2, 'Data/Pitts/Pit_flood4.shp')