Wednesday, May 28, 2014

Whatcom County Voter Database 05.20.2014





The corresponding political piece for this article is here.
The active voter Whatcom County database as of 05.20.2014 is  (row) 36  *  (column) 126283 =  (ncell) 4546188. On an 8 GB I-5 laptop on running 64 bit R 3.1 this isn't much of a problem. There are a number of different approaches/packaged I have found to using R for large data: 
  • data.table
  • RPostgres
  • plyr
  • sqldf
For my laptop, ncell < 5M is handled well enough by  the base stats and graphics packages, although I really like the plyr count function.  Both RPostgres and data.table manipulate the code a faster at some added complexity and increased functionality.  Being able to keep multiple voter databases in Postgres, perform complex SQL queries from the RPostrgres DBI interface and display the results/graphs in R is probably the best bet. I will discuss this in another post.  For the graphs above, I rely heavily on an awkward and nearly unreadable subsetting  and concatenation as in this line:

b2009__ <-(subset(BallotCounted2009,grepl("NA",BallotCounted2009),select = freq))[2,]

BallotCounted is the table of summarized votes for the years 2009 - 2013 (General Election only). I subset using grepl and then am forced to subset again apparently because either grepl is seeing '0' as equivalent to 'NA' or the subset function itself reads a data.frame column as one line?


> BallotCounted
   x  freq  x  freq  x  freq  x  freq  x  freq
1  0 53080  0 17044  0 38248  0 21075  0 38042
2  1 69012  1 97212  1 63444  1 75750  1 52735
3 NA  4190 NA 12026 NA 24590 NA 29457 NA 35505


> BallotCounted2009
   x  freq
1  0 38042
2  1 52735
3 NA 35505

> subset(BallotCounted2009,grepl("NA",BallotCounted2009),select = freq)
   freq
1 38042
3 35505
> subset(BallotCounted2009,grepl("NA",BallotCounted2009),select = freq)[2,]
[1] 35505

Nothing I try here helps:

> subset(BallotCounted2009,grepl("NA",BallotCounted2009,useBytes=TRUE))
   x  freq
1  0 38042
3 NA 35505
> subset(BallotCounted2009,grepl("NA",BallotCounted2009,fixed=TRUE))
   x  freq
1  0 38042
3 NA 35505
> subset(BallotCounted2009,grepl("NA",BallotCounted2009,perl=TRUE))
   x  freq
1  0 38042
3 NA 35505


Clearly, some of this code is less than optimized.


#require(sqldf)
require(plyr)
setwd("C:/Politics")
VDB_05.20.2014 <- read.delim("05.20.2014.txt")
VDB <- VDB_05.20.2014
VDB_Voted <- VDB[,c(1,32:36)]
#sqldf("Select Count(*) from VDB_Voted where BallotCounted_5 = 1")
#sqldf("Select Count(*) from VDB_Voted where BallotCounted_4 = 1")
#sqldf("Select Count(*) from VDB_Voted where BallotCounted_3 = 1")
#sqldf("Select Count(*) from VDB_Voted where BallotCounted_2 = 1")
#sqldf("Select Count(*) from VDB_Voted where BallotCounted_1 = 1")

BallotCounted2013 <- (count(VDB_Voted$BallotCounted_1))
BallotCounted2012 <- (count(VDB_Voted$BallotCounted_2))
BallotCounted2011 <- (count(VDB_Voted$BallotCounted_3))
BallotCounted2010 <- (count(VDB_Voted$BallotCounted_4))
BallotCounted2009 <- (count(VDB_Voted$BallotCounted_5))
BallotCounted <- cbind(BallotCounted2013,BallotCounted2012,BallotCounted2011,BallotCounted2010,BallotCounted2009)

par(mfrow=c(1,4))
b2009_ <-(subset(BallotCounted2009,x == 1,select = freq)[1,])
b2010_ <-(subset(BallotCounted2010,x == 1,select = freq)[1,])
b2011_ <-(subset(BallotCounted2011,x == 1,select = freq)[1,])
b2012_ <-(subset(BallotCounted2012,x == 1,select = freq)[1,])
b2013_ <-(subset(BallotCounted2013,x == 1,select = freq)[1,])
barplot( c(b2009_,b2010_,b2011_,b2012_,b2013_),names.arg=c("2009","2010","2011","2012","2013"),xlab="Voted",ylim=c(0,100000),col="red")

b2009 <-(subset(BallotCounted2009,x == 0,select = freq)[1,])
b2010 <-(subset(BallotCounted2010,x == 0,select = freq)[1,])
b2011 <-(subset(BallotCounted2011,x == 0,select = freq)[1,])
b2012 <-(subset(BallotCounted2012,x == 0,select = freq)[1,])
b2013 <-(subset(BallotCounted2013,x == 0,select = freq)[1,])
barplot( c(b2009,b2010,b2011,b2012,b2013),names.arg=c("2009","2010","2011","2012","2013"),xlab="Not Voted",ylim=c(0,100000),col="blue")

barplot( c((b2009_/(b2009 + b2009_)),
(b2010_/(b2010 + b2010_)),
(b2011_/(b2011 + b2011_)),
(b2012_/(b2012 + b2012_)),
(b2013_/(b2013 + b2013_)) ),
names.arg=c("2009","2010","2011","2012","2013"),xlab="Voted/(Voted + Not Voted)",col="dark gray")

b2009__ <-(subset(BallotCounted2009,grepl("NA",BallotCounted2009),select = freq))[2,]
b2010__ <-(subset(BallotCounted2010,grepl("NA",BallotCounted2009),select = freq))[2,]
b2011__ <-(subset(BallotCounted2011,grepl("NA",BallotCounted2009),select = freq))[2,]
b2012__ <-(subset(BallotCounted2012,grepl("NA",BallotCounted2009),select = freq))[2,]
b2013__ <-(subset(BallotCounted2013,grepl("NA",BallotCounted2009),select = freq))[2,]
barplot( c(b2009__,b2010__,b2011__,b2012__,b2013__),names.arg=c("2009","2010","2011","2012","2013"),ylab="N/A : 05.20.2014 voters not registered in these elections",ylim=c(0,100000),col="light gray")

mtext("Whatcom County General Elections 2009 - 2013 for voters active May 20 2014 only",side=3,outer=TRUE,line=-1)
mtext("Totals for Voters active May 20 2014 only", side=1, outer=TRUE,line=-1)

par(mfrow=c(1,2))
b2009_ <-(subset(BallotCounted2009,x == 1,select = freq)[1,])
b2010_ <-(subset(BallotCounted2010,x == 1,select = freq)[1,])
b2011_ <-(subset(BallotCounted2011,x == 1,select = freq)[1,])
b2012_ <-(subset(BallotCounted2012,x == 1,select = freq)[1,])
b2013_ <-(subset(BallotCounted2013,x == 1,select = freq)[1,])
barplot( c(b2009_,b2010_,b2011_,b2012_,b2013_),names.arg=c("2009","2010","2011","2012","2013"),ylim=c(0,100000),xlab="Voted",col="red")

b2009 <-(subset(BallotCounted2009,x == 0,select = freq)[1,])
b2010 <-(subset(BallotCounted2010,x == 0,select = freq)[1,])
b2011 <-(subset(BallotCounted2011,x == 0,select = freq)[1,])
b2012 <-(subset(BallotCounted2012,x == 0,select = freq)[1,])
b2013 <-(subset(BallotCounted2013,x == 0,select = freq)[1,])
barplot( c(b2009,b2010,b2011,b2012,b2013),names.arg=c("2009","2010","2011","2012","2013"),xlab="Not Voted",ylim=c(0,100000),col="blue")
mtext("Whatcom County General Elections 2009 - 2013 for voters active May 20 2014 only",side=3,outer=TRUE,line=-1)

par(mfrow=c(1,2))
plot( c(b2009_,b2010_,b2011_,b2012_,b2013_),type="p",pch=15,cex=5,xlab="2009 - 2013 General Election : Voted(Blue) Not Voted(Red)",ylab="Votes",ylim=c(0,100000),col.lab="red",col="blue")
points( c(b2009,b2010,b2011,b2012,b2013),type="p",pch=15,cex=5,col="red")

barplot( c((b2009_/(b2009 + b2009_)),
(b2010_/(b2010 + b2010_)),
(b2011_/(b2011 + b2011_)),
(b2012_/(b2012 + b2012_)),
(b2013_/(b2013 + b2013_)) ),
names.arg=c("2009","2010","2011","2012","2013"),xlab="Voted/(Voted + Not Voted)",col="dark gray")
mtext("Whatcom County General Elections 2009 - 2013 for voters active May 20 2014 only",side=3,outer=TRUE,line=-1)

par(mfrow=c(1,2))
b2009__ <-(subset(BallotCounted2009,grepl("NA",BallotCounted2009),select = freq))[2,]
b2010__ <-(subset(BallotCounted2010,grepl("NA",BallotCounted2009),select = freq))[2,]
b2011__ <-(subset(BallotCounted2011,grepl("NA",BallotCounted2009),select = freq))[2,]
b2012__ <-(subset(BallotCounted2012,grepl("NA",BallotCounted2009),select = freq))[2,]
b2013__ <-(subset(BallotCounted2013,grepl("NA",BallotCounted2009),select = freq))[2,]
barplot( c(b2009__,b2010__,b2011__,b2012__,b2013__),names.arg=c("2009","2010","2011","2012","2013"),xlab="N/A : 05.20.2014 voters not registered in these elections with total votes in red",ylim=c(0,100000),col="light gray")
points( c(b2009_,b2010_,b2011_,b2012_,b2013_),type="p",pch=15,cex=5,col="red")
# mtext("Whatcom County General Elections 2009 - 2013 for voters active May 20 2014 only",side=3,outer=TRUE,line=-1)

barplot( c((b2009 + b2009_),
(b2010 + b2010_),
(b2011 + b2011_),
(b2012 + b2012_),
(b2013 + b2013_) ),
names.arg=c("2009","2010","2011","2012","2013"),xlab="Voted + Not Voted",col="light blue")
mtext("Whatcom County General Elections 2009 - 2013 for voters active May 20 2014 only",side=3,outer=TRUE,line=-1)


No comments:

Post a Comment