#' Get the extension of each file, or "" for files that have none.
#'
#' The extension is whatever follows the last dot of the file's base name,
#' so dots that only appear in directory names are ignored.
#'
#' @param files character vector of file paths
#' @return character vector of the same length as `files`
getExtension <- function( files ){
    out <- character( length(files) )
    base <- basename( files )
    has.ext <- grepl( ".", base, fixed = TRUE )
    # greedy match up to the last dot of the base name, keep what follows
    out[ has.ext ] <- sub( "^.*[.]", "", base[ has.ext ] )
    out
}

#' Process one svn log item, something like:
# ------------------------------------------------------------------------
# r50000 | ripley | 2009-10-09 10:34:17 +0200 (Fri, 09 Oct 2009) | 1 line
# Changed paths:
#    M /branches/R-2-10-branch/src/library/stats/R/plot.lm.R
# 
# port r49999 from trunk
#'
#' @param txt character vector: the lines of one log item, starting with the
#'   separator line
#' @return a character matrix with one row per changed path and the columns
#'   revision, author, date, action, file (in that order, unnamed), or NULL
#'   when the item has no usable content (separator only, automatic commit
#'   by "apache"/"root", or no changed-path lines)
process_chunk <- function( txt ){
    # a lone separator line carries no commit
    if( length( txt ) < 2L ) return( NULL )
    
    header_line <- strsplit( txt[2L], " | ", fixed = TRUE )[[1L]][ 1:3 ]
    
    # revision number, without the leading "r"
    revision <- substring( header_line[1L], 2L )
    
    # who committed the revision
    author <- header_line[2L]
    
    # these are uninteresting automatic commits (only the date-stamp is involved)
    if( author %in% c( "apache", "root" ) ) return( NULL )
    
    # keep the "YYYY-MM-DD" part of the time stamp
    date <- substring( header_line[3L], 1L, 10L )
    
    # the changed paths sit between line 4 and the first blank line; without
    # a blank line every remaining line is treated as a path line
    first_blank <- which( txt == "" )[1L]
    if( is.na( first_blank ) ) first_blank <- length( txt ) + 1L
    # no path lines at all: nothing to report (and seq.int would count backwards)
    if( first_blank <= 4L ) return( NULL )
    file_lines <- sub( "^ +", "", txt[ seq.int( 4L, first_blank - 1L ) ] )
    
    # M, A, D, ... : everything before the first space
    actions <- sub( " .*$", "", file_lines )
    
    # the files: everything after the action, so that paths containing
    # spaces are kept whole; lines without a path give NA
    has_path <- grepl( " ", file_lines, fixed = TRUE )
    files <- rep( NA_character_, length( file_lines ) )
    files[ has_path ] <- sub( "^[^ ]* +", "", file_lines[ has_path ] )
    # drop the copy source that svn appends, e.g. " (from /trunk/old.R:12)"
    files <- sub( " \\(from .*:[0-9]+\\)$", "", files )
    
    # we format the result as a matrix
    # this is much more efficient than formatting as a data frame
    # at that stage
    nlines  <- length( actions )
    matrix( c( 
        rep.int( revision, nlines ), 
        rep.int( author, nlines ), 
        rep.int( date, nlines ), 
        actions, 
        files ), nrow = nlines )
    
}

# read the svn log, parse every commit and stack the per-file rows
data <- local( {
    log_lines <- readLines( "rcpp.svnlog" )
    # each line made only of dashes opens a new commit
    commit_id <- cumsum( grepl( "^-+$", log_lines ) )
    per_commit <- lapply( split( log_lines, commit_id ), process_chunk )
    do.call( rbind, per_commit )
} )
colnames( data ) <- c( "revision", "author", "date", "action", "file" )

# switch to a data frame, convert the types and derive a few extra columns
data <- within( as.data.frame( data, stringsAsFactors = FALSE ), {
    date      <- as.Date( date, format = "%Y-%m-%d" )
    revision  <- as.integer( revision )
    extension <- getExtension( file )
    weekday   <- weekdays( date )
    year      <- as.integer( format( date, "%Y" ) )
    month     <- month.abb[ as.integer( format( date, "%m" ) ) ]
} )

# leave out the commits of "stefan7th", then rebuild the author factor
# from the authors that remain
data <- data[ !( data$author %in% "stefan7th" ), , drop = FALSE ]
data$author <- factor( as.character( data$author ) )

# a simpler data set with one entry per commit: the first row seen for
# each revision, restricted to the commit-level columns
simple <- local( {
    first_row   <- !duplicated( data$revision )
    commit_cols <- c( "revision", "author", "date", "month", "year", "weekday" )
    data[ first_row, commit_cols ]
} )

# number of commits per day (only days with at least one commit appear)
daydata <- local( {
    per_day <- table( simple$date )
    counts  <- data.frame( 
        date    = as.Date( names( per_day ) ), 
        commits = as.integer( per_day ) )
    within( counts, {
        month   <- as.integer( format( date, "%m" ) )
        weekday <- weekdays( date )
        year    <- as.integer( format( date, "%Y" ) )
    } )
} )

# number of commits per day and per author
day_author_data <- local( {
    counts <- aggregate( simple$revision, 
        by = list( date = simple$date, author = simple$author ), 
        FUN = length )
    # the third column holds the counts
    names( counts )[3] <- "commits"
    counts$date  <- as.Date( counts$date )
    counts$month <- as.integer( format( counts$date, "%m" ) )
    counts$year  <- as.integer( format( counts$date, "%Y" ) )
    counts
} )

# number of commits per month, summed from the daily counts
monthdata <- local({
    # "YYYY-MM" key identifying the month of each day
    month_key <- sprintf( "%04d-%02d", daydata$year, daydata$month )
    key_rx    <- "^(\\d+)-(\\d+)$"
    totals    <- aggregate( daydata$commits, list( date = month_key ), sum )
    names( totals )[2] <- "commits"
    within( totals, {
        # year and month stay character strings, taken from the key
        year  <- sub( key_rx, "\\1", date )
        month <- sub( key_rx, "\\2", date )
        # the month is represented by the date of its first day
        date  <- as.Date( paste( date, "01", sep = "-" ), format = "%Y-%m-%d" )
    } )
})

# number of commits per month and per author, summed from the daily counts
month_author_data <- local({
    # "YYYY-MM" key identifying the month of each (day, author) row
    month_key <- sprintf( "%04d-%02d", day_author_data$year, day_author_data$month )
    key_rx    <- "^(\\d+)-(\\d+)$"
    totals    <- aggregate( 
        day_author_data$commits, 
        list( date = month_key, author = day_author_data$author ), 
        sum )
    names( totals )[3] <- "commits"
    within( totals, {
        # year and month stay character strings, taken from the key
        year  <- sub( key_rx, "\\1", date )
        month <- sub( key_rx, "\\2", date )
        # the month is represented by the date of its first day
        date  <- as.Date( paste( date, "01", sep = "-" ), format = "%Y-%m-%d" )
    } )
})


# calculate the number of files in the distribution at each revision
# (sum of "A"dded files so far minus sum of "D"eleted files so far)
#
# tapply() orders its result by revision, which matches sort(unique(...)).
# Dates and authors go through as.character() first: when tapply()
# simplifies a result built from factor values, only the integer codes
# survive, so the author names would be lost.
nfiles_data <- data.frame( 
    revision = sort( unique(data$revision ) ),
    nfiles   = cumsum( tapply( data$action, data$revision, function(x){ 
        sum( x == "A", na.rm = TRUE) - sum(x == "D", na.rm = TRUE) 
    } ) ), 
    date     = as.Date( tapply( as.character(data$date), data$revision, head, 1 ) ), 
    author   = factor( 
        tapply( as.character(data$author), data$revision, head, 1 ), 
        levels = levels( data$author ) ) )

    
#' Draws a loess smooth of y over a regular date grid, where y is filled
#' with 0 for the grid dates that are not in x
#' (thanks to Duncan Murdoch for the suggestion)
#' 
#' Each y value is placed at the grid position of its own date, so x does
#' not need to be sorted; dates that are not on the grid (for instance
#' outside of start..end) are ignored.
#' 
#' @param x dates
#' @param y numbers, one per date in x
#' @param by see ?seq.Date
#' @param start start date in the extended date vector
#' @param end end date in the extended date vector
#' @param ... further arguments passed to panel.lines
panel.loess.fill <- function(x, y, by = "day", start = min(x), end = max(x), ...){
    xx <- seq.Date( start, end, by= by )
    yy <- numeric( length(xx) )
    # position of each observed date on the grid (NA when not on the grid)
    pos <- match( x, xx )
    on.grid <- !is.na( pos )
    yy[ pos[ on.grid ] ] <- y[ on.grid ]
    loess.out <- loess( as.numeric(yy) ~ as.numeric(xx) )
    panel.lines( xx, predict( loess.out), ... )
}

#' draws one vertical reference line at each month start
#'
#' @param start first month start to mark
#' @param end last date of the sequence of month starts
#' @param lwd line width, passed to panel.abline
#' @param col line colour, passed to panel.abline
#' @param ... further arguments passed to panel.abline
panel.monthlines <- function( 
    start = as.Date( "2008-01-01" ) , end = as.Date( "2020-09-01" ), 
    lwd = 0.5, col = "gray", ... ){
    
    panel.abline( 
        v = seq.Date( start, end, by = "month" ), 
        col = col, lwd = lwd, ... )
}
    
#' grabs version number, release date and size of each Rcpp source archive
#' listed on the given CRAN archive page(s)
#'
#' The "%d-%b-%Y" dates are parsed with the current LC_TIME locale.
#'
#' @param urls urls of the archive listing pages to read
#' @param pattern regular expression; only the versions matching it are kept
#' @return data frame with the columns version (without the "Rcpp_" prefix
#'   and ".tar.gz" suffix), date (Date) and size (numeric; sizes ending
#'   in "K" are divided by 1024, the others only lose their "M")
releaseDate <- function( 
    urls = "http://cran.r-project.org/src/contrib/Archive/Rcpp/", 
    pattern = "" ){
    
    rx <- '^.*(Rcpp_.*?gz).*?right">([^\\s>]*?)\\s.*?right">\\s?([^>]*?)<.*$'
    
    # one listing page -> character matrix with one row per archive
    scrape_one <- function( url ){
        hits   <- grep( rx, readLines( url ), value = TRUE, perl = TRUE )
        fields <- strsplit( sub( rx, "\\1--\\2--\\3", hits, perl = TRUE ), "--" )
        do.call( rbind, fields )
    }
    data <- do.call( rbind, lapply( urls, scrape_one ) )
    colnames( data ) <- c("version", "date", "size")
    data <- as.data.frame( data )
    
    # sizes are computed from the raw strings before the column is replaced
    raw_size <- data$size
    in_kb    <- grepl( "K$", raw_size )
    size_num <- numeric( length( raw_size ) )
    size_num[ in_kb ]  <- as.numeric( sub( "K", "", raw_size[ in_kb ] ) ) / 1024
    size_num[ !in_kb ] <- as.numeric( sub( "M", "", raw_size[ !in_kb ] ) )
    
    data$version <- sub( "^Rcpp_(.*)[.]tar[.]gz.*$", "\\1", data$version )
    data$date    <- as.Date( data$date, format = "%d-%b-%Y" )
    data$size    <- size_num
    
    data[ grepl( pattern, data$version ), ]
}
# remember the current LC_TIME locale and switch to "C" before the release
# dates are parsed with "%d-%b-%Y"
# NOTE(review): lct is not used again in the visible part of the script, so
# the locale stays "C" -- confirm whether it is restored further down
lct <- Sys.getlocale( "LC_TIME" )
Sys.setlocale( "LC_TIME", "C" )

# the releases listed on CRAN plus a manually added entry for 0.8.5
releases <- rbind( 
    releaseDate(), 
    data.frame( version = "0.8.5", date = as.Date( "2010-07-26" ), size = 1.6 ) )

# major releases are the versions that end in ".0"
releases[["major"]] <- grepl( "[.]0$", releases$version )

#' helper function to draw the releases on the top axis and the months on
#' the bottom axis; meant to be used as the `axis` argument of xyplot
#'
#' Reads the global `releases` data frame (columns date, version, major).
#'
#' @param side side of the panel being drawn; "top" and "bottom" are
#'   handled here, any other side falls through to axis.default
#' @param ... passed to axis.default for the other sides
axis.releases <- function(side, ...) {
    switch(side,
        top = {
            # minor releases: rotated labels outside the panel
            minor <- !releases$major
            panel.axis(side = side, outside = TRUE,
                at = releases$date[ minor ], 
                labels = releases$version[ minor ], 
                rot = 45)
            
            # major releases: big bold labels inside the panel, drawn one
            # at a time so that a failure only loses that label; iterating
            # over the indices themselves draws nothing when there is no
            # major release
            for( i in which( releases$major ) )
                try( panel.axis(side = side, outside = FALSE,
                    at = releases$date[ i ], 
                    labels = releases$version[ i ], 
                    rot = 0, text.fontface = "bold" , text.cex = 2), silent = TRUE )
            
        },
        bottom = {
            months <- seq.Date( as.Date( "2008-01-01" ), as.Date( "2010-09-01" ), by = "month" )
            
            # first letter of each month name, outside the panel
            panel.axis(side = side, outside = TRUE,
                at = months, labels = substring( format(months, "%b" ), 1, 1 ), rot = 0 )
            
            # year labels inside the panel
            try( panel.axis(side = side, outside = FALSE, 
                at = as.Date("2009-01-01"), labels = "2009", rot = 0, 
                text.fontface = "bold", text.alpha = .5, text.col = "darkgray", 
                text.cex = 2 ), silent = TRUE )
            panel.axis(side = side, outside = FALSE, 
                at = as.Date("2010-01-01"), labels = "2010", rot = 0, 
                text.fontface = "bold", text.alpha = .5, text.col = "darkgray", 
                text.cex = 2 )
            
        }, 
        axis.default(side = side, ...)
    )
}


# commits per day over the whole history, with a loess trend line
png( "commits_per_day.png", width = 1500, height = 500 )
print( xyplot( 
    commits ~ date, 
    data  = daydata, 
    ylab  = "commits / day",
    axis  = axis.releases,
    panel = function( x, y, ... ){
        panel.monthlines( )
        panel.abline( h = seq( 0, 50, by = 5 ), lwd = .5, col = "gray" )
        panel.loess.fill( x, y, lwd = 5, col = "black", ... )
        panel.xyplot( x, y, pch = "+", ... )
        # month lines drawn again, on top of the data
        panel.abline( 
            v = seq.Date( as.Date("2007-01-01"), as.Date("2010-09-01"), by = "month" ), 
            lwd = .5, col = "gray" )
    }
    # disabled option: subset = date > as.Date("2009-10-01")
) )
dev.off()

# commits per day, restricted to the dates after 2009-10-01
png( "commits_per_day__zoom.png", width = 1500, height = 500 )
print( xyplot( 
    commits ~ date, 
    data   = daydata, 
    subset = date > as.Date("2009-10-01"), 
    ylab   = "commits / day",
    ylim   = c(-1,10), 
    axis   = axis.releases,
    panel  = function( x, y, ... ){
        panel.monthlines( )
        panel.abline( h = seq( 0, 50, by = 1 ), lwd = .5, col = "gray" )
        panel.loess.fill( x, y, lwd = 5, col = "black", ... )
        panel.xyplot( x, y, pch = "+", ... )
        # month lines drawn again, on top of the data
        panel.abline( 
            v = seq.Date( as.Date("2007-01-01"), as.Date("2010-09-01"), by = "month" ), 
            lwd = .5, col = "gray" )
    }
) )
dev.off()


# commits per day, one panel per author
png( "commits_per_day_per_author.png", width = 1500, height = 500 )
print( xyplot( 
    commits ~ date | author, 
    data  = day_author_data, 
    ylab  = "commits / day",
    axis  = axis.releases,
    panel = function( x, y, ... ){
        panel.monthlines( )
        # thin line at every unit, stronger line at every multiple of 5
        panel.abline( h = seq( 0, 50, by = 1 ), lwd = .5, col = "gray" )
        panel.abline( h = seq( 0, 50, by = 5 ), lwd = 1 , col = "darkgray" )
        panel.loess.fill( x, y, lwd = 5, col = "black", ... )
        panel.xyplot( x, y, pch = "+", ... )
        # month lines drawn again, on top of the data
        panel.abline( 
            v = seq.Date( as.Date("2007-01-01"), as.Date("2010-09-01"), by = "month" ), 
            lwd = .5, col = "gray" )
    }
    # disabled options:
    # subset = date > as.Date("2009-10-01"), layout = c(3,1), ylim = c(-1, 10 )
) )
dev.off()
 

# commits per day and per author, restricted to the dates after 2009-10-01,
# with the panels laid out in 3 columns and 1 row
png( "commits_per_day_per_author__zoom.png", width = 1500, height = 500 )
print( xyplot( 
    commits ~ date | author, 
    data   = day_author_data, 
    subset = date > as.Date("2009-10-01"), 
    layout = c(3,1),
    ylab   = "commits / day",
    ylim   = c(-1,10),  
    axis   = axis.releases,
    panel  = function( x, y, ... ){
        panel.monthlines( )
        panel.abline( h = seq( 0, 20, by = 1 ), lwd = .5, col = "gray" )
        panel.loess.fill( x, y, lwd = 5, col = "black", ... )
        panel.xyplot( x, y, pch = "+", ... )
        # month lines drawn again, on top of the data
        panel.abline( 
            v = seq.Date( as.Date("2007-01-01"), as.Date("2010-09-01"), by = "month" ), 
            lwd = .5, col = "gray" )
    }
) )
dev.off()

# commits per month over the whole history, with a loess trend line
png( "commits_per_month.png", width = 1500, height = 500 )
print( xyplot( 
    commits ~ date, 
    data  = monthdata, 
    ylab  = "commits / month",
    axis  = axis.releases,
    panel = function( x, y, ... ){
        panel.monthlines()
        # thin line every 20 commits, stronger lines at "pretty" values
        panel.abline( h = seq(0,500, by = 20), lwd = .5, col = "gray" )
        panel.abline( h = pretty(y), lwd = 1, col = "darkgray" )
        panel.loess.fill( x, y, by = "month", lwd = 2, col = "black", ... )
        panel.xyplot( x, y, pch = 21, fill = "red", cex = 2, ... )
    }
) )
dev.off()
    
# commits per month, restricted to the dates after 2009-10-01
png( "commits_per_month__zoom.png", width = 1500, height = 500 )
print( xyplot( 
    commits ~ date, 
    data   = monthdata, 
    subset = date > as.Date("2009-10-01"),
    ylab   = "commits / month",
    axis   = axis.releases,
    panel  = function( x, y, ... ){
        panel.monthlines()
        panel.abline( h = seq(0,500, by = 20), lwd = .5, col = "gray" )
        panel.abline( h = seq(0,500, by = 100), lwd = 1, col = "darkgray" )
        
        # every month between the first and the last one shown, with 0
        # commits for the months that have no row in the data
        all_months <- seq.Date( min(x), max(x), by = "month" )
        filled <- numeric( length(all_months) )
        filled[ all_months %in% x ] <- y
        panel.lines( all_months, filled )
        
        panel.xyplot( all_months, filled, pch = 21, fill = "red", cex = 2, ... )
    }
) )
dev.off()
    

# commits per month, one panel per author
png( "commits_per_month_author.png", width = 1500, height = 500 )
print( xyplot( 
    commits ~ date | author, 
    data  = month_author_data, 
    ylab  = "commits / month",
    axis  = axis.releases,
    panel = function( x, y, ... ){
        panel.monthlines()
        panel.abline( h = seq(0,500, by = 20), lwd = .5, col = "gray" )
        panel.abline( h = seq(0,500, by = 100), lwd = 1, col = "darkgray" )
        
        # every month between the first and the last one shown, with 0
        # commits for the months that have no row in the data
        all_months <- seq.Date( min(x), max(x), by = "month" )
        filled <- numeric( length(all_months) )
        filled[ all_months %in% x ] <- y
        panel.lines( all_months, filled )
        panel.xyplot( all_months, filled, pch = 21, fill = "red", cex = 1.5, ... )
    }
    # disabled option: subset = date > as.Date("2009-10-01")
) )
dev.off()

# commits per month and per author, restricted to the dates after 2009-10-01
png( "commits_per_month_author__zoom.png", width = 1500, height = 500 )
print( xyplot( 
    commits ~ date | author, 
    data   = month_author_data, 
    subset = date > as.Date("2009-10-01"),
    ylab   = "commits / month",
    axis   = axis.releases,
    panel  = function( x, y, ... ){
        panel.monthlines()
        panel.abline( h = seq(0,500, by = 20), lwd = .5, col = "gray" )
        panel.abline( h = seq(0,500, by = 100), lwd = 1, col = "darkgray" )
        
        # every month between the first and the last one shown, with 0
        # commits for the months that have no row in the data
        all_months <- seq.Date( min(x), max(x), by = "month" )
        filled <- numeric( length(all_months) )
        filled[ all_months %in% x ] <- y
        panel.lines( all_months, filled )
        panel.xyplot( all_months, filled, pch = 21, fill = "red", cex = 1.5, ... )
    }
) )
dev.off()