Sunday, July 14, 2019

Divining come and gone voters from two different data slices four months apart

# Divining come and gone voters from two different data slices four months apart
# Set proper paths for your data and data sets
# fread chokes on too many columns thus a workaround with rbind
# RMF 07/14/2019
# This is designed to be memory efficient non SQL

library(data.table)
library(lubridate)
library(lattice)



## With Status Code,CountyCode,PrecinctCode,
setwd("C:\\Users\\rferrisx\\Downloads\\December2018")
r1scp <- fread("201812_VRDB_Extract.txt",
nrows=682858,
strip.white = TRUE,
fill = TRUE,
blank.lines.skip = TRUE,
select=c("StateVoterID","StatusCode","CountyCode","PrecinctCode"),
encoding="UTF-8")[,.(SVI.12.18=StateVoterID,StatusCode,CountyCode,PrecinctCode)]

r2scp <- fread("201812_VRDB_Extract.txt",
strip.white = TRUE,
fill = TRUE,
blank.lines.skip = TRUE,
skip=682858,
select=c("V1","V21","V22","V37"),
encoding="UTF-8")[,.(SVI.12.18=V1,StatusCode=V37,CountyCode=V21,PrecinctCode=V22)];

r3scp <- rbind(r1scp,r2scp)
rm(r1scp)
rm(r2scp)

setwd("C:\\Users\\rferrisx\\Downloads\\AprilVRDB")
s1scp <- fread("201904_VRDB_Extract.txt",
nrows=674170,
strip.white = TRUE,
fill = TRUE,
blank.lines.skip = TRUE,
select=c("StateVoterID","StatusCode","CountyCode","PrecinctCode"),
encoding="UTF-8")[,.(SVI.04.19=StateVoterID,StatusCode,CountyCode,PrecinctCode)]

s2scp <- fread("201904_VRDB_Extract.txt",
strip.white = TRUE,
fill = TRUE,
blank.lines.skip = TRUE,
skip=674170,
select=c("V1","V21","V22","V37"),
encoding="UTF-8")[,.(SVI.04.19=V1,StatusCode=V37,CountyCode=V21,PrecinctCode=V22)]

s3scp <- rbind(s1scp,s2scp)
rm(s1scp)
rm(s2scp)

setkey(r3scp,SVI.12.18)
setkey(s3scp,SVI.04.19)

# Comparisons

# in Apr not Dec
newApr <- setnames(r3scp[s3scp,.(SVI.12.18,SVI.04.19),by=.EACHI],c("Dec.plus.Apr","not.in.Dec","Apr.plus.Dec"))[is.na(not.in.Dec),.(newApr=Apr.plus.Dec)]
NewApr.scp <- s3scp[SVI.04.19 %in% newApr$newApr,]

# in Dec not April
onlyDec <- setnames(s3scp[r3scp,.(SVI.04.19,SVI.12.18),by=.EACHI],c("Apr.plus.Dec","not.in.Apr","Dec.plus.April"))[is.na(not.in.Apr),.(onlyDec=Dec.plus.April)]
OnlyDec.scp <- r3scp[SVI.12.18 %in% onlyDec$onlyDec,]


# By Specific County: Whatcom
OnlyDec.scp[CountyCode == "WM",]
OnlyDec.scp[CountyCode == "WM",.N,.(StatusCode)]

NewApr.scp[CountyCode == "WM",]
NewApr.scp[CountyCode == "WM",.N,.(StatusCode)]


# By Specific County: King
OnlyDec.scp[CountyCode == "KI",]
OnlyDec.scp[CountyCode == "KI",.N,.(StatusCode)]

NewApr.scp[CountyCode == "KI",]
NewApr.scp[CountyCode == "KI",.N,.(StatusCode)]


# List by each county by specific status
fsum <- function(x) {sum(x,na.rm=TRUE)};
OnlyDec.scp[,.N,.(StatusCode,CountyCode)][,dcast(.SD,CountyCode ~ StatusCode,value.var="N",fun.aggregate=fsum)][order(-I)]
NewApr.scp[,.N,.(StatusCode,CountyCode)][,dcast(.SD,CountyCode ~ StatusCode,value.var="N",fun.aggregate=fsum)][order(-A)]


# For charts, need to Status Code check column lengths:
fsum <- function(x) {sum(x,na.rm=TRUE)};
NewApr.scp[,.N,.(StatusCode,CountyCode)][,
dcast(.SD,CountyCode ~ StatusCode,value.var="N",fun.aggregate=fsum)][order(-A)][,
setnames(.SD,c("CountyCode","Active","ADUP","Inactive","Pending","PDUPL"))][][,
barchart(~Active+Inactive+Pending+ADUP+PDUPL | as.factor(CountyCode),data=.SD,Stack=T,origin=0,allow.multiple=TRUE)]

OnlyDec.scp[,.N,.(StatusCode,CountyCode)][,
dcast(.SD,CountyCode ~ StatusCode,value.var="N",fun.aggregate=fsum)][order(-A)][,
setnames(.SD,c("CountyCode","Active","ADUP","Inactive","IDUP","Pending","PDETH","PDUPL","PFELN"))][][, barchart(~Active+Inactive+Pending+ADUP+PDUPL+IDUP+PDETH+PFELN | as.factor(CountyCode),data=.SD,Stack=T,origin=0,allow.multiple=TRUE)]

#Top Counties only
dev.new()
NewApr.scp[CountyCode %in% c("KI","PI","SN","CR","SP"),.N,.(StatusCode,CountyCode)][,
dcast(.SD,CountyCode ~ StatusCode,value.var="N",fun.aggregate=fsum)][order(-A)][,
setnames(.SD,c("CountyCode","Active","ADUP","Inactive","Pending","PDUPL"))][][,
barchart(~Active+Inactive+Pending+ADUP+PDUPL | as.factor(CountyCode),data=.SD,Stack=T,origin=0,allow.multiple=TRUE)]

dev.new()
OnlyDec.scp[CountyCode %in% c("KI","PI","SN","CR","SP"),.N,.(StatusCode,CountyCode)][,
dcast(.SD,CountyCode ~ StatusCode,value.var="N",fun.aggregate=fsum)][order(-A)][,
setnames(.SD,c("CountyCode","Active","ADUP","Inactive","IDUP","PDETH","PDUPL","PFELN"))][][,
barchart(~Active+Inactive+ADUP+PDUPL+IDUP+PDETH+PFELN | as.factor(CountyCode),data=.SD,Stack=T,origin=0,allow.multiple=TRUE)]



No comments:

Post a Comment