################################################################
# http://www.goldsheet.com/historic/cbblog06.html

# If you have an internet connection:
#
#	Visit http://www.goldsheet.com
#	"Past Scores & Ratings"
# 	"Final College Basketball Logs & Ratings"
#	"2006-2007 College Basketball Scores & Pointspreads"
#

# Read the page into a character vector, one element per line
# (sep = "\n").  Use the URL to fetch it from the web, or switch to
# the commented-out call below to read a saved copy when offline.
# scan() skips blank lines by default, so the line positions used
# further down are positions in this blank-stripped vector.
x <- scan(
  file = "http://www.goldsheet.com/historic/cbblog06.html",
  what = "",
  sep = "\n"
)
# x <- scan(file = "cbblog06.html", what = "", sep = "\n")

# Remove every <...> tag, then turn the "&amp;" entity back into a
# plain ampersand.
y <- gsub(
  pattern = "&amp;",
  replacement = "&",
  x = gsub(pattern = "<[^<>]*>", replacement = "", x = x)
)

# Quick and dirty: keep only elements 36 through 6955 of the cleaned
# page.  The bounds are hard-coded for this particular page.
z <- y[seq(from = 36, to = 6955)]

# The lines in z are treated as fixed-width records; each field is
# pulled out by character position.

# Dates: columns 1-5.
dates <- substring(z, first = 1, last = 5)

# Spread result: the single character in column 29.
result <- substring(z, first = 29, last = 29)

# Point spread: columns 32-35.  "P" is rewritten as "0" and an
# apostrophe as ".5" before converting to numeric; anything that is
# still not a number becomes NA.
spread <- substring(z, first = 32, last = 35)
spread <- gsub("'", ".5", gsub("P", "0", spread))
spread <- as.numeric(spread)

# Scores: columns 36-44, kept as text for now.
scores <- substring(z, first = 36, last = 44)

# Site: everything from column 45 onward, collapsed to one code.
# A field containing "H" becomes "H"; otherwise one containing "V"
# becomes "V"; every other field becomes "N".
site <- substring(z, first = 45)
site[grepl("H", site)] <- "H"
site[grepl("V", site)] <- "V"
site[!(site == "H" | site == "V")] <- "N"

# Keep only the lines that have a "-" in column 40, which is the fifth
# character of the score field (columns 36-44).  The same mask is
# applied to every extracted column so they stay aligned.
select <- (substring(z, 40, 40) == "-")
result <- result[select]
spread <- spread[select]
scores <- scores[select]
dates <- dates[select]
site <- site[select]

# Split each score field at "-" and build a two-column numeric matrix:
# column 1 holds the number before the first dash, column 2 the number
# after it.  Exactly the first two pieces are taken from every row, so
# a field that splits into fewer or more than two pieces (strsplit()
# drops a trailing empty piece and returns extra pieces for extra
# dashes) yields NA or ignored extras in that row only, instead of
# shifting the scores of every later row.
temp <- strsplit(scores, "-")
temp <- matrix(
  as.numeric(c(
    vapply(temp, function(p) p[1], character(1)),
    vapply(temp, function(p) p[2], character(1))
  )),
  nrow = length(scores),
  ncol = 2
)

# Assemble one row per selected line.  hscore is the first number of
# the score field and vscore the second (presumably home and visitor
# scores -- confirm against the page layout).
bb <- data.frame(spresult = result, spread = spread,
                 hscore = temp[, 1], vscore = temp[, 2], date = dates,
                 site = site)

# Drop rows whose point spread could not be parsed.  The filter uses
# the data frame's own column so it cannot drift out of step with the
# free-standing `spread` vector.
bb <- bb[!is.na(bb$spread), ]
rownames(bb) <- NULL

# Actual margin of the game: second score minus first score.
bb$gamespread <- bb$vscore - bb$hscore

# Keep only rows coded "H" for site, and only the two columns needed,
# then write them out as a comma-separated file with a header row.
bb <- bb[bb$site == "H", c("spread", "gamespread")]
write.table(bb, "cbb2006.csv", sep = ",",
            row.names = FALSE, col.names = TRUE)