################################################################
# http://www.goldsheet.com/historic/cbblog06.html
#
# Scrape the 2006-2007 college basketball log from goldsheet.com,
# pull the point spread and final score out of each fixed-width
# game line, and write the spread together with the observed
# score difference to "cbb2006.csv" for the rows flagged "H".
#
# If you have an internet connection:
#
#   Visit http://www.goldsheet.com
#     "Past Scores & Ratings"
#     "Final College Basketball Logs & Ratings"
#     "2006-2007 College Basketball Scores & Pointspreads"
################################################################

# First, let's scan in the web page (either directly from
# the web, or from a file if the network is unavailable).
# Each element of x is one line of the page.
x <- scan("http://www.goldsheet.com/historic/cbblog06.html",
          what = "", sep = "\n")
# x <- scan("cbblog06.html", what = "", sep = "\n")

# Use R's regular expressions to strip out all the nasty HTML:
y <- gsub("<[^<>]*>", "", x)
# Decode the ampersand entity. Everything below relies on fixed
# character offsets, so a five-character "&amp;" left in a line
# would shift every later column of that line by four positions.
y <- gsub("&amp;", "&", y, fixed = TRUE)

# Quick and dirty: limit our attention to lines that matter.
# NOTE(review): the 36:6955 window is hard-coded for the page as it
# was when this script was written -- confirm against the live page.
z <- y[36:6955]
# Indexing past the end of y yields NA; drop those so a shorter
# page does not inject NA rows into the parsing below.
z <- z[!is.na(z)]

# Get the column of spread results:
result <- substring(z, 29, 29)

# Get and process the column of point spreads.
# "P" is replaced by 0 (presumably a pick'em game) and a trailing
# apostrophe is replaced by ".5" (presumably a half point).
spread <- substring(z, 32, 35)
spread <- gsub("P", "0", spread)
spread <- as.numeric(gsub("'", ".5", spread))

# Get and process the scores:
scores <- substring(z, 36, 44)

# Grab the dates:
dates <- substring(z, 1, 5)

# Grab the home/visitor/neutral site variable. Anything in the tail
# of the line containing "H" becomes "H", then anything containing
# "V" becomes "V", and whatever is left is labelled "N".
site <- substring(z, 45)
site[grep("H", site)] <- "H"
site[grep("V", site)] <- "V"
site[site != "H" & site != "V"] <- "N"

# Keep only lines with a "-" in column 40, i.e. lines whose score
# field has the "nn-nn" shape expected below.
select <- (substring(z, 40, 40) == "-")
result <- result[select]
spread <- spread[select]
scores <- scores[select]
dates <- dates[select]
site <- site[select]

# Split each "nn-nn" score into a two-column numeric matrix,
# one row per game.
temp <- strsplit(scores, "-")
temp <- matrix(as.numeric(unlist(temp)), length(scores), 2, byrow = TRUE)

# The first number of each score pair is stored as hscore and the
# second as vscore.
bb <- data.frame(spresult = result,
                 spread = spread,
                 hscore = temp[, 1],
                 vscore = temp[, 2],
                 date = dates,
                 site = site)

# Drop games whose spread could not be parsed as a number. Use the
# data frame's own column rather than the free-standing vector.
bb <- bb[!is.na(bb$spread), ]
rownames(bb) <- NULL

# Observed score difference, on the same rows as the posted spread.
bb$gamespread <- bb$vscore - bb$hscore

# Keep only the rows flagged "H" and the two columns of interest.
bb <- bb[bb$site == "H", c("spread", "gamespread")]

write.table(bb, "cbb2006.csv", sep = ",",
            row.names = FALSE, col.names = TRUE)