# Append the class label and the finite-population correction to the survey
# data, then declare a design stratified by class_s with no cluster ids.
data <- cbind(data, class_s, fpc)
library(survey)
pop.types <- xtabs(~class_s, data = data)
I_design <- svydesign(id = ~1, strata = ~class_s, fpc = ~fpc, data = data)

# Row-wise sum of `terms`, ignoring NAs. Rows in which every entry of `items`
# is NA get NA instead of 0. `items` defaults to `terms`.
row_sum_or_na <- function(terms, items = terms) {
  terms <- as.matrix(terms)
  items <- as.matrix(items)
  total <- unname(rowSums(terms, na.rm = TRUE))
  total[rowSums(!is.na(items)) == 0] <- NA
  total
}

n <- nrow(data)

# C1. Project financing: finance (Q38: yes/no) and f_index (Q39, count yes)
# The item columns are assumed to be numeric — TODO confirm. `2 - code` turns
# a code of 1 into 1 and a code of 2 into 0 (presumably 1 = yes, 2 = no).
q38_yes <- row_sum_or_na(2 - as.matrix(data[, 75:83]))
finance <- rep(NA, n)
finance[which(q38_yes > 0)] <- 1
finance[which(q38_yes == 0)] <- 0

# NOTE(review): the header says "count yes", but unlike the other indices this
# sums the raw item codes rather than `2 - code`; confirm the coding of
# columns 84:113 before relying on f_index.
f_index <- row_sum_or_na(as.matrix(data[, 84:113]))

# C2. Paid versus volunteer participants: paid_volunteer
# Same rule as `finance`, restricted to columns 75:78.
paid_yes <- row_sum_or_na(2 - as.matrix(data[, 75:78]))
paid_volunteer <- rep(NA, n)
paid_volunteer[which(paid_yes > 0)] <- 1
paid_volunteer[which(paid_yes == 0)] <- 0

# C3. Developer motivations: need (Q40_7, Q40_9, Q45_7, Q45_9),
# leisure (Q40_4, Q45_4), fin_motive (Q40_6, Q45_6), motivation (sum of Q40, Q45)
need <- row_sum_or_na(
  2 - cbind(data$q_0040_7, data$q_0040_9, data$q_0045_7, data$q_0045_9)
)
leisure <- row_sum_or_na(2 - cbind(data$q_0040_4, data$q_0045_4))
fin_motive <- row_sum_or_na(2 - cbind(data$q_0040_6, data$q_0045_6))

# Q45 items (columns 128:137) are converted with as.numeric() column by column.
# The matrix is built once instead of being grown inside a loop; the name
# `temp0` is kept because later code reads it.
temp0 <- do.call(cbind, lapply(128:137, function(k) as.numeric(data[, k])))

# Q40 (columns 114:123) and Q45 together: 20 items per respondent.
motivation <- row_sum_or_na(2 - cbind(as.matrix(data[, 114:123]), temp0))

# C4.
# The homogeneity/heterogeneity of participants:
# homo_geog (Q14), homo_motive (Q45), homo_edu (Q48)
q14 <- data$q_0014
q23 <- data$q_0023
q48 <- data$q_0048

# NA-safe views of Q23 so that no test below can evaluate to NA.
q23_is_1 <- !is.na(q23) & q23 == 1
q23_is_2 <- !is.na(q23) & q23 == 2

# --- homo_geog ---------------------------------------------------------------
# Q14 codes 1, 2, 3, 4, 10 give 1; codes 5, 6, 7, 8, 9, 11 give 0.
# Q14 is consulted when Q23 == 1, or when Q23 is missing and column 6 of
# `data` exceeds 1 (column 6 is presumably the team size — TODO confirm).
# A row whose Q23 and column 6 are both missing is left NA.
q14_codes_one <- q14 %in% c(1, 2, 3, 4, 10)
q14_codes_zero <- q14 %in% c(5, 6, 7, 8, 9, 11)
use_q14 <- q23_is_1 | (is.na(q23) & data[, 6] > 1)

homo_geog <- rep(NA, n)
homo_geog[which(use_q14 & q14_codes_one)] <- 1
homo_geog[which(use_q14 & q14_codes_zero)] <- 0
# Q23 == 2 with Q14 answered is coded 1.
homo_geog[q23_is_2 & !is.na(q14)] <- 1

# --- homo_motive -------------------------------------------------------------
# Q45 items (columns 128:137), converted with as.numeric() column by column.
q45 <- do.call(cbind, lapply(128:137, function(k) as.numeric(data[, k])))
q45_missing <- rowSums(is.na(q45)) # number of NAs in each row of Q45 1:10
q45_score <- rowSums(2 - q45, na.rm = TRUE)

homo_motive <- rep(NA, n)
# NOTE(review): as in the original, the "at least one Q45 answer" requirement
# applies only when Q23 is missing. A row with Q23 == 1 and all ten Q45 items
# missing therefore scores 0 rather than NA — confirm that this is intended.
scored <- which((is.na(q23) & q45_missing < 10) | q23_is_1)
homo_motive[scored] <- q45_score[scored]
# To avoid imputation, 1-developer projects have 0 heterogeneity.
homo_motive[q23_is_2] <- 0

# --- homo_edu ----------------------------------------------------------------
# Q48 codes 1:3 give 1 and codes 4:7 give 0 when Q23 == 1; Q23 == 2 gives 1.
# Rows with Q23 missing stay NA. The original tested `q_0023 == 1` without an
# NA guard, so such a row could hand NA to `if` and stop the script.
homo_edu <- rep(NA, n)
homo_edu[q23_is_1 & q48 %in% c(1, 2, 3)] <- 1
homo_edu[q23_is_1 & q48 %in% c(4, 5, 6, 7)] <- 0
homo_edu[q23_is_2] <- 1

# C5.
# The skill/knowledge continuum of developers:
# os_experience (q2, q3), continuity (q7), prof_skill (q41, q42 / q46, q41, q47),
# education (q43, q48), writing (q44, q49)

# --- os_experience -----------------------------------------------------------
# Each component is divided by its own observed maximum, then the two are
# added with NAs ignored (so a row with both missing yields 0).
q2_scaled <- (2009 - data$q_0002) / max(2009 - data$q_0002, na.rm = TRUE)
q3_scaled <- data$q_0003 / max(data$q_0003, na.rm = TRUE)
os_experience <- rowSums(cbind(q2_scaled, q3_scaled), na.rm = TRUE)

# --- continuity --------------------------------------------------------------
# Start from Q7; fall back to Q7B when Q7 is missing, and also when the Q7
# score is 0 and Q7B was answered.
continuity <- 2 - data$q_0007
take_q7b <- is.na(continuity) | (continuity == 0 & !is.na(data$q_0007B))
continuity[take_q7b] <- 2 - data$q_0007B[take_q7b]

# --- prof_skill --------------------------------------------------------------
q23 <- data$q_0023
q46 <- data$q_0046
q47 <- data$q_0047
own_skill <- (2 - data$q_0041) * (4 - data$q_0042)

# If Q23 is NA or 1 then Q46 and Q47 were asked.
q46_q47_asked <- is.na(q23) | q23 == 1
# Q47 is used only when Q46 == 1 and Q47 was answered.
from_q47 <- q46_q47_asked & !is.na(q46) & q46 == 1 & !is.na(q47)
# Otherwise the Q41/Q42 product is used: Q46 missing (don't know if more than
# one professional), Q46 == 1 without Q47, Q46 == 2, or Q23 == 2.
from_own <- (q46_q47_asked & (is.na(q46) | (q46 == 1 & is.na(q47)) | q46 == 2)) |
  (!q46_q47_asked & q23 == 2)

prof_skill <- rep(NA, n)
prof_skill[from_own] <- own_skill[from_own]
prof_skill[from_q47] <- 4 - q47[from_q47]

# --- education ---------------------------------------------------------------
# Just take the answer they gave: Q48 when present, otherwise Q43.
q43 <- data$q_0043
q48 <- data$q_0048

edu_from_q43 <- rep(NA, n)
low43 <- which(q43 < 4)
edu_from_q43[low43] <- 4 - q43[low43]
edu_from_q43[which(q43 == 4)] <- 2
edu_from_q43[which(q43 == 5)] <- 3
edu_from_q43[which(q43 == 6)] <- 1

edu_from_q48 <- rep(NA, n)
low48 <- which(q48 < 4)
edu_from_q48[low48] <- 4 - q48[low48]
edu_from_q48[which(q48 %in% c(4, 5, 7))] <- 3
edu_from_q48[which(q48 == 6)] <- 2
edu_from_q48[which(q48 == 8)] <- 1

q48_missing <- is.na(q48)
education <- rep(NA, n)
education[q48_missing] <- edu_from_q43[q48_missing]
education[!q48_missing] <- edu_from_q48[!q48_missing]

# --- writing -----------------------------------------------------------------
# Q49 when present, otherwise Q44; both reversed as 6 - answer.
q49_missing <- is.na(data$q_0049)
writing <- rep(NA, n)
writing[q49_missing] <- 6 - data$q_0044[q49_missing]
writing[!q49_missing] <- 6 - data$q_0049[!q49_missing]

# C6.
# Software modularity, granularity and complexity (Q8, Q10, Q19)
proprietary <- 2 - data$q_0008
modularity <- 3 - data$q_0010
complexity <- 5 - data$q_0019

# Row-wise sum of `terms`, ignoring NAs. Rows in which every entry of `items`
# is NA get NA instead of 0. `items` defaults to `terms`.
row_sum_or_na <- function(terms, items = terms) {
  terms <- as.matrix(terms)
  items <- as.matrix(items)
  total <- unname(rowSums(terms, na.rm = TRUE))
  total[rowSums(!is.na(items)) == 0] <- NA
  total
}

# C7. Approaches to requirements gathering (Q11A, Q12, Q11)
# Q11A occupies columns 33:39; the score is the sum of `2 - item` over 7.
plan_communicate <- row_sum_or_na(2 - as.matrix(data[, 33:39])) / 7
plan_user_involve <- (3 - data$q_0012) / 2
plan_exist <- 2 - data$q_0011
z <- cbind(plan_communicate, plan_user_involve, plan_exist)
require_index <- row_sum_or_na(z)

# C8. Some elements of Leadership
# Q16 answer codes 1..5 are mapped to hour values; any other value stays NA.
hrs <- c(.05, 3, 6, 9, 11)[match(data$q_0016, 1:5)]

lead_items <- cbind(
  data$q_0009, data$q_0011, data$q_0018, data$q_0013, hrs,
  data$q_0020, data$q_0035, data$q_0024
)
lead_terms <- cbind(
  rowSums(2 - lead_items[, 1:3, drop = FALSE], na.rm = TRUE),
  2 - lead_items[, 4],
  lead_items[, 5] / 11,
  # No na.rm here: if either Q20 or Q35 is missing, the whole term is dropped.
  rowSums(6 - lead_items[, 6:7, drop = FALSE]) / 5,
  (3 - lead_items[, 8]) / 2
)
# Sum of the available terms; NA only when all eight items are missing.
leadership <- row_sum_or_na(lead_terms, lead_items)

# C9. Social capital and reciprocity:
# soci_capital (Q25, Q50, Q51), recip_specific (Q26), recip_general (Q27)
q23 <- data$q_0023
q25_part <- (3 - data$q_0025) / 2
q25_part[is.na(q25_part)] <- 0
q50_q51_part <- rowSums(cbind(data$q_0050 - 1, data$q_0051 - 1), na.rm = TRUE) / 5
none_answered <- is.na(data$q_0025) & is.na(data$q_0050) & is.na(data$q_0051)

soci_capital <- rep(NA, n)
soci_capital[!none_answered] <- (q25_part + q50_q51_part)[!none_answered]
# Set social capital = 0 if the project has only 1 member.
soci_capital[none_answered & !is.na(q23) & q23 == 2] <- 0

recip_specific <- 5 - data$q_0026
recip_general <- 5 - data$q_0027

# C10. Marketing (Q21, Q35, Q11A)
marketing <- row_sum_or_na(
  cbind(2 - data$q_0021, (6 - data$q_0035) / 5, plan_communicate)
)

# C11.
# Institutional structure and design (Q30, Q31, Q32, Q33, Q34, Q34A)

# Row-wise sum of `terms`, ignoring NAs. Rows in which every entry of `items`
# is NA get NA instead of 0. `items` defaults to `terms`.
row_sum_or_na <- function(terms, items = terms) {
  terms <- as.matrix(terms)
  items <- as.matrix(items)
  total <- unname(rowSums(terms, na.rm = TRUE))
  total[rowSums(!is.na(items)) == 0] <- NA
  total
}

# --- rule_ops ----------------------------------------------------------------
# The three Q30 items are recoded as `2 - answer`. Missing items are replaced
# by 4 so that a partially answered row matches none of the patterns below
# and stays NA. Rows with all three items missing are skipped.
ops <- cbind(2 - data$q_0030_1, 2 - data$q_0030_2, 2 - data$q_0030_3)
ops_answered <- rowSums(is.na(ops)) < 3
ops[is.na(ops)] <- 4
ops_total <- rowSums(ops)

rule_ops <- rep(NA, n)
rule_ops[ops_answered & ops_total == 0] <- 0
rule_ops[ops_answered & ops_total == 3] <- 5
rule_ops[ops_answered & ops_total == 1 & ops[, 1] == 1] <- 1
rule_ops[ops_answered & ops_total == 1 & ops[, 2] == 1] <- 3
rule_ops[ops_answered & ops_total == 1 & ops[, 3] == 1] <- 6
rule_ops[ops_answered & ops_total == 2 & ops[, 3] == 0] <- 2
rule_ops[ops_answered & ops_total == 2 & ops[, 2] == 0] <- 4
# NOTE(review): the pattern (0, 1, 1) has no code and stays NA, as in the
# original — confirm that this is intended.

rule_decision <- data$q_0031

# --- rule_leader -------------------------------------------------------------
# "-oth-" answers are treated as missing before the integer conversion.
q32 <- data$q_0032
q32[which(q32 == "-oth-")] <- NA
rule_leader <- as.integer(q32) - 2

govern <- data$q_0033 - 1

# --- gov_evolution -----------------------------------------------------------
q34a <- data$q_0034A
q34a[which(q34a == "-oth-")] <- NA
q34a <- as.integer(q34a) - 1
q34 <- 2 - data$q_0034

# The original test `!is.na(a) | a == 1` evaluates to NA when Q34 is missing,
# which stops the script, and is always TRUE otherwise. Behaviour for answered
# Q34 is unchanged: Q34A is used, then overridden by 0 when the Q34 score is 0.
# A missing Q34 with Q34A answered now takes Q34A instead of failing
# (presumably the intent was `is.na(a) | a == 1` — confirm).
gov_evolution <- rep(NA, n)
use_q34a <- !(is.na(q34) & is.na(q34a)) & (is.na(q34) | q34 != 0)
gov_evolution[use_q34a] <- q34a[use_q34a]
gov_evolution[which(q34 == 0)] <- 0

# C12. Other
utility <- row_sum_or_na(cbind(4 - data$q_0036, 4 - data$q_0037))
complexity <- 5 - data$q_0019
competition <- 3 - data$q_0022

# --- community ---------------------------------------------------------------
# Mean of `2 - item` over the five Q28 items plus Q29 divided by its observed
# maximum; NA only when all six inputs are missing.
comm_items <- cbind(
  data$q_0028_1, data$q_0028_2, data$q_0028_3, data$q_0028_4, data$q_0028_5,
  data$q_0029
)
comm_terms <- cbind(
  rowSums(2 - comm_items[, 1:5, drop = FALSE], na.rm = TRUE) / 5,
  comm_items[, 6] / max(data$q_0029, na.rm = TRUE)
)
community <- row_sum_or_na(comm_terms, comm_items)

# --- memb_inet ---------------------------------------------------------------
# 0 when Q23 == 2, otherwise 2 - Q15.
q23 <- data$q_0023
memb_inet <- 2 - data$q_0015
memb_inet[!is.na(q23) & q23 == 2] <- 0

# --- hours -------------------------------------------------------------------
# Answer codes are mapped to hour values; any other value stays NA.
hrs16 <- c(.05, 3, 6, 9, 11)[match(data$q_0016, 1:5)]
# The original mapped Q17 code 2 to 35, which breaks the otherwise increasing
# sequence .05, _, 8, 13, 18, 21; 3.5 is presumably the intended value —
# confirm against the questionnaire.
hrs17 <- c(.05, 3.5, 8, 13, 18, 21)[match(data$q_0017, 1:6)]
hours <- rowSums(cbind(hrs16, hrs17), na.rm = TRUE)

# --- assemble the index table ------------------------------------------------
class_s <- as.factor(class_s)
# cbind() stores every variable as a number (factors become integer codes).
data_index <- cbind(
  class_s, finance, f_index, paid_volunteer, need, leisure, fin_motive,
  motivation, homo_geog, homo_motive, homo_edu, os_experience, continuity,
  prof_skill, education, writing, proprietary, modularity, complexity,
  require_index, leadership, soci_capital, recip_specific, recip_general,
  marketing, rule_ops, rule_decision, govern, gov_evolution, rule_leader,
  utility, competition, community, memb_inet, hours
)
data_index <- data.frame(data_index)
data_nd <- read.table("ia_allqs_nd_data.txt", header = TRUE)
data_index_nd <- cbind(data_index, data_nd[, -1])

# --- correlations ------------------------------------------------------------
# Correlation matrix of the index variables (complete cases only). This result
# is overwritten by the pairwise version on the next line.
x <- cor(data_index_nd, use = "complete.obs")
x <- cor(data_index, use = "pairwise.complete.obs") # bob variation
x[1, ] # correlation with class_s
# Correlation with class_s, ordered from largest to smallest absolute value.
x[1, order(abs(x[1, ]), decreasing = TRUE)]

# more bob: find highly correlated variables.
# The comparison with 1 is numeric (the original compared against the string
# '1'), and NA correlations are skipped instead of stopping the loop.
for (i in seq_len(nrow(x))) {
  for (j in seq_len(ncol(x))) {
    r <- x[i, j]
    if (!is.na(r) && abs(r) > .5 && r < 1) {
      print(c(rownames(x)[i], colnames(x)[j], r))
    }
  }
}

# --- design-based tests of each index against class_s ------------------------
# `fpc` is not a column of data_index, so it is presumably resolved from the
# calling environment — TODO confirm.
I_design <- svydesign(id = ~1, strata = ~class_s, fpc = ~fpc, data = data_index)
svychisq(~finance + class_s, I_design)
summary(svyglm(f_index ~ class_s, I_design))
# The original referred to `paid_vs_vol`, which is not in data_index.
svychisq(~paid_volunteer + class_s, I_design)
summary(svyglm(need ~ class_s, I_design))
summary(svyglm(leisure ~ class_s, I_design))
summary(svyglm(fin_motive ~ class_s, I_design))
# The original referred to `team_motivation`, which is not in data_index.
summary(svyglm(motivation ~ class_s, I_design))
svychisq(~homo_geog + class_s, I_design)
summary(svyglm(homo_motive ~ class_s, I_design))
svychisq(~homo_edu + class_s, I_design)
summary(svyglm(os_experience ~ class_s, I_design))
svychisq(~continuity + class_s, I_design)
# Design-based tests of the remaining indices against class_s.
# svyglm() summaries and svychisq() tests, kept in their original order.
summary(svyglm(prof_skill ~ class_s, I_design))
svychisq(~prof_skill + class_s, I_design)
svychisq(~proprietary + class_s, I_design)
svychisq(~modularity + class_s, I_design)
summary(svyglm(complexity ~ class_s, I_design))
summary(svyglm(require_index ~ class_s, I_design))
summary(svyglm(education ~ class_s, I_design))
svychisq(~education + class_s, I_design)
svychisq(~govern + class_s, I_design)
svychisq(~gov_evolution + class_s, I_design)
summary(svyglm(writing ~ class_s, I_design))
svychisq(~writing + class_s, I_design)
summary(svyglm(leadership ~ class_s, I_design))
summary(svyglm(soci_capital ~ class_s, I_design))
svychisq(~recip_specific + class_s, I_design)
svychisq(~recip_general + class_s, I_design)
summary(svyglm(marketing ~ class_s, I_design))
summary(svyglm(rule_ops ~ class_s, I_design))
summary(svyglm(rule_decision ~ class_s, I_design))
summary(svyglm(rule_leader ~ class_s, I_design))
summary(svyglm(utility ~ class_s, I_design))
# complexity is fitted a second time here, as in the original sequence.
summary(svyglm(complexity ~ class_s, I_design))
summary(svyglm(competition ~ class_s, I_design))
summary(svyglm(community ~ class_s, I_design))
summary(svyglm(memb_inet ~ class_s, I_design))

###### classification analysis
library(rpart)
library(randomForest)
library(adabag)
source("cartware.R")

# Rebuild the index table without class_s so it can serve as predictors only.
# cbind() stores every variable as a number before the data.frame conversion.
data_index <- data.frame(cbind(
  finance, f_index, paid_volunteer, need, leisure, fin_motive, motivation,
  homo_geog, homo_motive, homo_edu, os_experience, continuity, prof_skill,
  education, writing, proprietary, modularity, complexity, require_index,
  leadership, soci_capital, recip_specific, recip_general, marketing,
  rule_ops, rule_decision, govern, gov_evolution, rule_leader, utility,
  competition, community, memb_inet, hours
))
data_index_nd <- cbind(data_index, data_nd[, -1])
class_s <- as.factor(class_s)

y0 <- data_index_nd # y0 is another copy of the index data, NAs kept
y <- na.roughfix(data_index_nd) # y is the copy with NAs filled in

# Random forest on the filled-in predictors; class_s is not a column of `y`
# and is taken from the workspace.
rf_I_index_nd <- randomForest(
  class_s ~ .,
  data = y, ntree = 500, importance = TRUE, proximity = TRUE, nodesize = 6
)
ww <- round(importance(rf_I_index_nd), 2)
varImpPlot(rf_I_index_nd)
x11()
# Second forest.
# NOTE(review): despite the `_naomit` name, this is fitted on the same
# filled-in data `y` with the same settings as rf_I_index_nd — confirm whether
# an NA-omitted data set was intended.
rf_I_index_nd_naomit <- randomForest(
  class_s ~ .,
  data = y, ntree = 500, importance = TRUE, proximity = TRUE, nodesize = 6
)
round(importance(rf_I_index_nd_naomit), 2)
varImpPlot(rf_I_index_nd_naomit)

# Classification tree on the data with NAs kept. cart() and var.importance()
# are not defined in this part of the file (presumably they come from
# cartware.R — TODO confirm).
cart_I_index <- cart(
  class_s ~ .,
  data = y0, method = "class",
  control = rpart.control(xval = 80, minbucket = 1, minsplit = 6, cp = 0)
)
summary(cart_I_index)
var.importance(cart_I_index)

################## logistic regression ######################
# NOTE(review): attach() is kept so that names such as tracker_reports and
# forum_posts resolve exactly as before; workspace variables of the same name
# still take precedence over the attached columns.
attach(data_index_nd)
v <- 2 - as.numeric(class_s)

summary(glm(v ~ hours + community + leadership + marketing + continuity,
  family = binomial(link = "logit")
))
summary(glm(
  v ~ leadership + community + rule_decision + memb_inet + tracker_reports +
    forum_posts,
  family = binomial(link = "logit")
))

m <- ncol(data_index_nd)

# For every column of data_index_nd that is not in `fixed`, fit a logit model
# of `v` on the `fixed` columns plus that column, and return the value in
# column 4 of the coefficient table for the added column (row
# length(fixed) + 2). Positions listed in `fixed` stay NA.
added_term_p <- function(fixed) {
  pvals <- rep(NA, m)
  for (k in setdiff(seq_len(m), fixed)) {
    model_data <- cbind(
      data.frame(.y = v),
      data_index_nd[, c(fixed, k), drop = FALSE]
    )
    fit <- glm(.y ~ ., data = model_data, family = binomial(link = "logit"))
    pvals[k] <- summary(fit)$coefficients[length(fixed) + 2, 4]
  }
  pvals
}

# Step 1: each column on its own.
p <- added_term_p(integer(0))
min(abs(p))
which(p == min(abs(p), na.rm = TRUE))

# Steps 2-4: add one column at a time to models that already contain
# column 20, then columns 20 and 32, then columns 20, 32 and 27.
q <- seq_len(m)
p <- added_term_p(20)
p <- added_term_p(c(20, 32))
p <- added_term_p(c(20, 32, 27))