# Load scraped forum-user data from a pickle, hand it to R through rpy2,
# draw an age histogram and a posts-vs-activity scatter plot, then fit a
# linear model of posts on days active and report it from Python.
import cPickle

import rpy2.robjects as R

# NOTE: unpickling can execute arbitrary code -- only load a data.pickle
# that comes from a trusted source.
# Pickles are binary data, so open in 'rb'; the with-block closes the file
# even if loading fails.
with open("data.pickle", "rb") as pickle_file:
    data = cPickle.load(pickle_file)

# Create an R data frame: one R vector per column of the pickled data.
column_types = [
    ('username', R.StrVector),
    ('join_date', R.StrVector),
    ('posts', R.IntVector),
    ('last_visit', R.StrVector),
    ('birthday', R.StrVector),
    ('age', R.IntVector),
]
dataframe = {}
for column, to_r_vector in column_types:
    dataframe[column] = to_r_vector(data[column])
MyRDataframe = R.DataFrame(dataframe)
print(MyRDataframe.colnames)

# The column order of a data frame built from a plain dict is not
# guaranteed, so the analysis below uses the named vectors instead of
# positional indices into MyRDataframe.
ages = dataframe['age']
posts = dataframe['posts']

# Plot ages.
hist = R.r.hist
R.r.png('~/Desktop/hist.png', width=300, height=300)
hist(ages, main="", xlab="", breaks=20)
R.r['dev.off']()  # 'dev.off' contains a dot, so look it up by name.

# Plot and fit a model of posts against activity.
# First we need to compute how many days each user has been active.
# The helper is vectorised: strptime() and difftime() propagate NA
# element-wise, so no scalar if() is applied to whole vectors.
activity = R.r(r'''
function(x, y) {
    date.1 <- strptime(x, "%m-%d-%Y")
    date.2 <- strptime(y, "%m-%d-%Y")
    difftime(date.1, date.2, units='days')
}
''')
as_numeric = R.r['as.numeric']
days_active = activity(dataframe['join_date'], dataframe['last_visit'])
# join_date precedes last_visit, so the raw difference is negative.
days_active = R.r.abs(days_active)
days_active = as_numeric(days_active)

plot = R.r.plot
R.r.png('~/Desktop/plot.png', width=300, height=300)
plot(days_active, posts, pch='.')
R.r['dev.off']()

# Fit a linear model: posts (response) as a function of days active
# (predictor), matching the scatter plot and the report printed below.
fm1a = R.Formula('y ~ x')
env = fm1a.environment
env['y'] = posts
env['x'] = days_active
fit = R.r.lm(fm1a)
summary = R.r.summary(fit)

# Now we can program with the results of fit in Python.
# The coefficient matrix is stored column-major, so its first two entries
# are the intercept and slope estimates.
coefficients = summary.rx2('coefficients')
print(coefficients)
intercept = coefficients[0]
slope = coefficients[1]
print("We are in Python now.")
print("A linear model is POSTS = %f + %f * DAYS_ACTIVE" % (intercept, slope))