All analyses were done using R version 3.5.1. Packages required for this document can be found in the code chunk below:

suppressPackageStartupMessages(require(ggmap))
suppressPackageStartupMessages(require(spgwr))
suppressPackageStartupMessages(require(gstat))
suppressPackageStartupMessages(require(raster))
suppressPackageStartupMessages(require(rms))
suppressPackageStartupMessages(require(Hmisc))
suppressPackageStartupMessages(require(GGally))
suppressPackageStartupMessages(require(knitr))
suppressPackageStartupMessages(require(kableExtra))

Reading in and cleaning the data

The chunk below details the data cleaning, subsetting, and geocoding I did to build a clean, manageable dataset for this project.

## NOT RUN:
#Allegheny Property Assessment Data
#Source: https://catalog.data.gov/dataset/allegheny-county-property-assessments
alle = readr::read_csv("data/Allegheny_Data/assessments-18.csv")

# subsetting on important features for geocoding
address = c("PROPERTYHOUSENUM",
            "PROPERTYADDRESS",
            "PROPERTYCITY",
            "PROPERTYSTATE",
            "PROPERTYZIP")

# some of the features are redundant or not necessarily useful for this project
# taking the most useful ones below
features = c("MUNICODE","SCHOOLCODE","NEIGHCODE","OWNERCODE","CLASS",
             "USECODE","LOTAREA","RECORDDATE","SALEDATE",
             "SALEPRICE","SALECODE","COUNTYBUILDING","COUNTYLAND",
             "COUNTYTOTAL","COUNTYEXEMPTBLDG","LOCALBUILDING","LOCALLAND",
             "LOCALTOTAL","FAIRMARKETBUILDING","FAIRMARKETLAND","FAIRMARKETTOTAL",
             "STYLE","STORIES","YEARBLT","EXTERIORFINISH",
             "ROOF","BASEMENT","GRADE","CONDITION",
             "CDU","TOTALROOMS","BEDROOMS","FULLBATHS",
             "HALFBATHS","HEATINGCOOLING","FIREPLACES",
             "BSMTGARAGE","FINISHEDLIVINGAREA","PREVSALEPRICE","PREVSALEPRICE2") 
to_subset = c(address, features)
sub = alle[complete.cases(alle[,to_subset]),]

# taking random uniform subset of size 5000
# makes computations much simpler
set.seed(1717)
sampledPop = round(runif(5000,min=1,max=nrow(sub)))
samp = sub[sampledPop,]

#geocoding
# register_google("<API KEY HERE>")

# loop through rows and geocode each address
# the full Allegheny County set would take ~12 hrs to geocode with Google
# the sample of 5000 takes ~45 min
samp$lon = NA
samp$lat = NA

for(i in 1:nrow(samp)){
  addy = geocode(paste(samp[i,address],collapse=" "),
                 output = "latlon",
                 source = "google")
  samp[i,'lon'] = addy$lon
  samp[i,'lat'] = addy$lat
}

#save the dataset
saveRDS(samp,"AlleghenyPropertySample.Rds")
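
#as an aside, ggmap's mutate_geocode() can geocode a whole data frame in one
#call instead of looping row by row; a minimal sketch (full_address is a
#hypothetical pasted-together column, not part of the original data):
# samp$full_address = apply(samp[,address], 1, paste, collapse = " ")
# samp = mutate_geocode(samp, full_address, source = "google")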

Geocoding setup

Here, I import the cleaned Allegheny County property assessment dataset and load the shapefile for Allegheny County from the Database of Global Administrative Areas (GADM).

samp = readRDS("AlleghenyCountySample.Rds")
us =getData('GADM',country="United States",level=2) 
allegheny = subset(us,NAME_1=="Pennsylvania" & NAME_2 == 'Allegheny')
p4 = proj4string(allegheny)

# plot(allegheny)
# points(samp$lon,samp$lat,
#        col="dodgerblue",
#        pch=20)

Lipschitz embedding method

The following chunk defines a function that performs the Lipschitz embedding procedure detailed in Appendix 1 of PAPER CITATION.
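
Concretely, each of the \(d\) embedded coordinates of an observation is its distance to the nearest point in a random reference set drawn from the study region, and the Chebyshev (\(L_\infty\)) distance between embedded vectors is then used as the approximate spatial distance (notation mine):

\[
\phi_j(x) = \min_{r \in R_j} d(x, r), \quad j = 1, \dots, d, \qquad
\hat{d}(x, y) = \max_{1 \le j \le d} \lvert \phi_j(x) - \phi_j(y) \rvert,
\]

where each \(R_j\) is a set of \(k\) points sampled uniformly at random from within the Allegheny County shapefile.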

# d: number of embedding dimensions (one random reference set per dimension)
# k: number of reference points per set
# dat: data containing 'lon' and 'lat' columns
# (relies on the allegheny shapefile and p4 projection string defined above)
lipschitz_embedding <- function(d=20,k=10,dat){
  ## --- TRANSFORM TO LIPSCHITZ SPACE
  
  # number of observations
  N = nrow(dat)
  
  # matrix to hold embedded coords
  lipschitz.coords = matrix(0,nrow=N,ncol=d)
  
  for(i in 1:d){
    # random spatial reference set of size k
    ## points are drawn from within the Allegheny County shapefile
    ref.set = spsample(x=allegheny,
                       n=k,
                       type = "random",
                       iter=+Inf)
    
    # find distances from each observation to random sample
    temp = spDists(x=SpatialPoints(dat[,c('lon','lat')],
                                   proj4string = CRS(p4)),
                   y=ref.set,
                   longlat = TRUE)
    
    # find minimum distance 
    temp = apply(X = temp,
                 MARGIN = 1,
                 FUN = min)
    
    # store in matrix
    lipschitz.coords[,i] = temp
  }
  
  ## --- USE DISTANCE FOR KNN
  
  #calculate approximate distance function
  approx.dist = as.matrix(dist(lipschitz.coords,
                   method="maximum",
                   diag=TRUE,
                   upper=TRUE))
  
  # return the embedded coordinates and the approximate (Chebyshev) distance matrix
  list(lipschitz.coords,approx.dist)
}
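
For a quick sanity check of the output structure, the embedding can be run on the geocoded sample directly; the returned (unnamed) list holds the embedded coordinate matrix first and the approximate distance matrix second. A minimal sketch (emb is an illustrative name, not used elsewhere):

# emb = lipschitz_embedding(d=20, k=10, dat=samp)
# dim(emb[[1]]) # N x d matrix of embedded coordinates
# dim(emb[[2]]) # N x N approximate (Chebyshev) distance matrix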

Fitting a simple OLS model

Since this is essentially a toy example for the problem at hand, I wanted a simple model to implement in the GWR simulations. Because GWR is just a locally weighted linear regression, I ran and explored some simple model-building techniques using Frank Harrell's rms package.

samp$GRADE = factor(samp$GRADE,
                    levels = c("X+","X","X-",
                               "A+","A","A-",
                               "B+","B","B-",
                               "C+","C","C-",
                               "D+","D","D-"))

vars = c("CLASS","LOTAREA",
         "SALEPRICE",
         "STYLE","STORIES","YEARBLT","EXTERIORFINISH",
         "ROOF","BASEMENT","GRADE","CONDITION",
         "CDU","TOTALROOMS","BEDROOMS","FULLBATHS",
         "HALFBATHS","HEATINGCOOLING","FIREPLACES",
         "BSMTGARAGE","FINISHEDLIVINGAREA"
         ,"PREVSALEPRICE","PREVSALEPRICE2"
         ) 

form = as.formula(paste0("COUNTYTOTAL~",
                         paste(vars,collapse=" + ")))
mod1 = ols(form,data=samp)
dropped = fastbw(mod1)
# form2 = as.formula(paste0("COUNTYTOTAL~",
#                          paste(dropped$names.kept[-c(1,2,7,8,9,10)],collapse=" + ")))
# form.withGrade = as.formula(paste0("COUNTYTOTAL~",
#                          paste(dropped$names.kept[-c(1,2,4,6,8)],collapse=" + ")))
form.withoutGrade = as.formula(paste0("COUNTYTOTAL~",
                         paste(dropped$names.kept[-c(1,2,4,5,6,8)],collapse=" + ")))
vals.withoutGrade = dropped$names.kept[-c(1,2,4,5,6,8)]


# ols(form.withGrade, data=samp) # adj r2 = .847
# ols(form.withoutGrade, data=samp) #adj r2 = .763

Calculating the betas

Below is the meat of the GWR process, implemented by hand in lieu of the pre-programmed models in the spgwr package. The work done in that package was still particularly helpful, since its source code and functions were used to develop the code that follows. Specifically, the gwr() function in spgwr did not allow as much flexibility as desired for implementing the distance-based weights, so the underlying calculations from gwr() were used instead. The same approach was taken for calculating the quasi-global \(R^2\) from the gwr() model, which is essentially the mean \(R^2\) over all points.
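
For reference, the local fit at each location \(i\) below is just a weighted least-squares regression, with weights from spgwr's bisquare kernel, and the quasi-global \(R^2\) is the mean of the local values (notation mine; \(b\) is the bandwidth bandw and \(d_{ij}\) is the distance between locations \(i\) and \(j\), geographic in the first run and Lipschitz-approximated in the second):

\[
w_{ij} = \begin{cases} \left(1 - d_{ij}^2/b^2\right)^2 & d_{ij} \le b \\ 0 & d_{ij} > b \end{cases}, \qquad
\hat{\beta}(i) = \left(X^\top W_i X\right)^{-1} X^\top W_i y, \qquad
\bar{R}^2 = \frac{1}{N}\sum_{i=1}^{N} R_i^2,
\]

where \(W_i = \mathrm{diag}(w_{i1}, \dots, w_{iN})\).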

#reproducibility
set.seed(1717)

#random subset of 100 rows for less computation (used for bandwidth selection below)
vals = round(runif(100,min=1,max=nrow(samp)))

#vals = 1:5000 #all data (full sample)

#scale and center, subset for columns of interest
clean_samp = as.matrix(samp[,c("COUNTYTOTAL",
                               vals.withoutGrade,"lon","lat")]) %>% scale()
cs1 = data.frame(clean_samp)
cs1 = cs1[vals,]

#calculate bandwidth for later W matrix calculations
# bandw = gwr.sel(form.withoutGrade,
#                 data=cs1,
#                 coords=cbind(cs1$lon,cs1$lat),
#                 longlat=TRUE)
bandw = 319.201408722481 #selected using full sample dataset above 

#get original distances (o = original), used to calculate bisquare weights
oSamp = clean_samp 
oDist = spDists(SpatialPoints(oSamp[,c('lon','lat')]))
w1 = gwr.bisquare(oDist^2,d=bandw) 

#get Lipschitz distances (l = Lipschitz), used to calculate bisquare weights
lSamp = lipschitz_embedding(d=20,k=5,clean_samp)
lDist = lSamp[[2]]
w2 = gwr.bisquare(lDist^2,d=bandw)

#data structures to hold results from loop below
oBetaCoef = matrix(NA,
                   ncol = length(vals.withoutGrade) + 1, # intercept + predictors
                   nrow = nrow(w1))
oR2 = numeric(nrow(w1))
oR2.a = numeric(nrow(w1))

lBetaCoef = matrix(NA,
                   ncol = length(vals.withoutGrade) + 1, # intercept + predictors
                   nrow = nrow(w1))
lR2 = numeric(nrow(w1))
lR2.a = numeric(nrow(w1))
  
#calculate all betas and R2 for every location/model
for(i in 1:nrow(w1)){ #fit models and get coef for both iterations
  #--original data
  
  #fit locally weighted linear model
  oMod_i = lm(form.withoutGrade,
              weights=w1[i,],
              data=data.frame(clean_samp))
  oBetaCoef[i,] = coefficients(oMod_i) #get coef.
  
  oSumm = summary(oMod_i) #get summ.
  oR2[i] = oSumm$r.squared #get R2
  oR2.a[i] = oSumm$adj.r.squared #get adj. R2
  
  #--lipschitz data
  
  #fit locally weighted linear model, using Lipschitz distance mat
  lMod_i = lm(form.withoutGrade,
              weights=w2[i,],
              data=data.frame(clean_samp))
  lBetaCoef[i,] = coefficients(lMod_i) #get coef.
  
  lSumm = summary(lMod_i) #get summ.
  lR2[i] = lSumm$r.squared # get R2
  lR2.a[i] = lSumm$adj.r.squared #get adj. R2
  
  #if(i%%100==0){print(i)}
}

#calculate quasi-global (mean) R2
oQuasiR2 = mean(oR2)
oQuasiR2.a = mean(oR2.a)

lQuasiR2 = mean(lR2)
lQuasiR2.a = mean(lR2.a)

saveRDS(object = oBetaCoef,file = "oBetaCoef.Rds")
saveRDS(object = lBetaCoef,file = "lBetaCoef.Rds")

vals.R2 = rbind(cbind(oQuasiR2,oQuasiR2.a),
                cbind(lQuasiR2,lQuasiR2.a))
colnames(vals.R2) = c("$R^2$","Adj. $R^2$")
rownames(vals.R2) = c("Original","Lipschitz")
saveRDS(object = vals.R2,file="R2values.Rds")

Graphical summaries

Data setup

oBetaCoef = readRDS("oBetaCoef.Rds")
lBetaCoef = readRDS("lBetaCoef.Rds")

lBetaCoef.df = data.frame(lBetaCoef)
oBetaCoef.df = data.frame(oBetaCoef)
colnames(lBetaCoef.df) = colnames(oBetaCoef.df) = c("Intercept",
                                                    "Sale Price",
                                                    "Total Rooms",
                                                    "Half Baths",
                                                    "Fireplaces",
                                                    "Finished Living Area",
                                                    "Previous Sale Price")

# scaled-density plot of the coefficient estimates for one variable, comparing methods
getScalePlot = function(plot_var){
  plot_dat = data.frame(rbind(cbind("Original",oBetaCoef.df[,plot_var]),
                              cbind("Lipschitz",lBetaCoef.df[,plot_var])))
  colnames(plot_dat) = c("Data","Values")
  plot_dat$Values = as.double(as.character(plot_dat$Values))
  
  ggplot(plot_dat, aes(x=Values,color=Data,fill=Data)) +
    geom_density(aes(y=..scaled..),alpha=0.5) +
    ylab("Scaled Density") +
    scale_color_manual(values = c("darkorange","darkgreen")) +
    scale_fill_manual(values = c("darkorange","darkgreen")) +
    ggtitle(paste0("Density of coefficient estimates for '",plot_var,"' \nby data-generating method"))
}

Intercept

p1 = getScalePlot("Intercept")
p1

# ggsave("ggplots/intercept.png")

Sale Price

p2 = getScalePlot("Sale Price")
p2