deicide Absurdistan Bankistan
Transcription
deicide Absurdistan Bankistan
deicide deicIDE |ˈdeɪɪsʌɪd, ˈdiːɪ-| ! ! The Rise and Fall of Integrated Development Environments The killing of a god Michele Lanza REVEAL | Faculty of Informatics University of Lugano, Switzerland ! Part I Inception ! Part II Ascension ! Part III Transcendence Bankistan R Absurdistan E V E A L de gustibus non est disputandum Tool == Cool Bugatti Veyron 16.4 987 BHP 408 km/h Weapons of Mass Instruction Part I Inception Shelby SuperCars Ultimate Aero TT 1,062 BHP 412 km/h Bugatti Veyron Super Sport 1,184 BHP 431 km/h Volkswagen Golf V GTI ? 408 + 235 = 431 197 BHP 235 km/h Edit Run Compile I D E ntegrated evelopment nvironment It’s time for new IDEas The IDE must DIE Software is a collection of computer programs and related data that provides the instructions for telling a computer what to do and how to do it. In contrast to hardware, software “cannot be touched”. What is Software? Programming is a kind of writing. ! Gerald Weinberg ! Psychology of Computer Programming ! Dorset House, 1971 /***************************************************************************/ /* micro-Max, */ /* A chess program smaller than 2KB (of non-blank source), by H.G. Muller */ /***************************************************************************/ /* version 3.2 (2000 characters) features: */ /* - recursive negamax search */ /* - quiescence search with recaptures */ /* - recapture extensions */ /* - (internal) iterative deepening */ /* - best-move-first 'sorting' */ /* - a hash table storing score and best move */ /* - full FIDE rules (expt minor ptomotion) and move-legality checking */ ! #define F(I,S,N) for(I=S;I<N;I++) #define W(A) while(A) #define K(A,B) *(int*)(T+A+(B&8)+S*(B&7)) #define J(A) K(y+A,b[y])-K(x+A,u)-K(H+A,t) ! ! ! #define U 16777224 struct _ {int K,V;char X,Y,D;} A[U]; /* hash table, 16M+8 entries*/ int V=112,M=136,S=128,I=8e4,C=799,Q,N,i; /* V=0x70=rank mask, M=0x88 */ ! ! 
/* piece symbols on printout*/ char O,K,L, w[]={0,1,1,3,-1,3,5,9}, /* relative piece values */ o[]={-16,-15,-17,0,1,16,0,1,16,15,17,0,14,18,31,33,0, /* step-vector lists */ 7,-1,11,6,8,3,6, /* 1st dir. in o[] per piece*/ 6,3,5,7,4,5,3,6}, /* initial piece setup */ b[129], /* board: half of 16x8+dummy*/ T[1035], /* hash translation table */ n[]=".?+nkbrq?*?NKBRQ"; programming is writing programming is writing D(k,q,l,e,J,Z,E,z,n) /* recursive minimax search, k=moving side, n=depth*/ int k,q,l,e,J,Z,E,z,n; /* (q,l)=window, e=current eval. score, E=e.p. sqr.*/ { /* e=score, z=prev.dest; J,Z=hashkeys; return score*/ int j,r,m,v,d,h,i=9,F,G; char t,p,u,x,y,X,Y,H,B; struct _*a=A; /* lookup pos. in hash table*/ j=(k*E^J)&U-9; /* try 8 consec. locations */ W((h=A[++j].K)&&h-Z&&--i); /* first empty or match */ a+=i?j:0; /* dummy A[0] if miss & full*/ if(a->K) /* hit: pos. is in hash tab */ {d=a->D;v=a->V;X=a->X; /* examine stored data */ if(d>=n) /* if depth sufficient: */ {if(v>=l|X&S&&v<=q|X&8)return v; /* use if window compatible */ d=n-1; /* or use as iter. start */ }X&=~M;Y=a->Y; /* with best-move hint */ Y=d?Y:0; /* don't try best at d=0 */ }else d=X=Y=0; /* start iter., no best yet */ N++; /* node count (for timing) */ W(d++<n|z==8&N<1e7&d<98) /* iterative deepening loop */ {x=B=X; /* start scan at prev. best */ Y|=8&Y>>4; /* request try noncastl. 1st*/ m=d>1?-I:e; /* unconsidered:static eval */ do{u=b[x]; /* scan board looking for */ if(u&k) /* own piece (inefficient!)*/ {r=p=u&7; /* p = piece type (set r>0) */ j=o[p+16]; /* first step vector f.piece*/ W(r=p>2&r<0?-r:-o[++j]) /* loop over directions o[] */ {A: /* resume normal after best */ y=x;F=G=S; /* (x,y)=move, (F,G)=castl.R*/ do{H=y+=r; /* y traverses ray */ if(Y&8)H=y=Y&~M; /* sneak in prev. best move */ if(y&M)break; /* board edge hit */ software is text if(p<3&y==E)H=y^16; /* shift capt.sqr. H if e.p.*/ t=b[H];if(t&k|p<3&!(r&7)!=!t)break; /* capt. own, bad pawn mode */ i=99*w[t&7]; /* value of capt. 
piece t */ if(i<0||E-S&&b[E]&&y-E<2&E-y<2)m=I; /* K capt. or bad castling */ if(m>=l)goto C; /* abort on fail high */ if(h=d-(y!=z)) /* remaining depth(-recapt.)*/ {v=p<6?b[x+8]-b[y+8]:0; /* center positional pts. */ b[G]=b[H]=b[x]=0;b[y]=u&31; /* do move, strip virgin-bit*/ if(!(G&M)){b[F]=k+6;v+=30;} /* castling: put R & score */ if(p<3) /* pawns: */ {v-=9*(((x-2)&M||b[x-2]!=u)+ /* structure, undefended */ ((x+2)&M||b[x+2]!=u)-1); /* squares plus bias */ if(y+r+1&S){b[y]|=7;i+=C;} /* promote p to Q, add score*/ } v=-D(24-k,-l-(l>e),m>q?-m:-q,-e-v-i, /* recursive eval. of reply */ J+J(0),Z+J(8)+G-S,F,y,h); /* J,Z: hash keys */ v-=v>e; /* delayed-gain penalty */ if(z==9) /* called as move-legality */ {if(v!=-I&x==K&y==L) /* checker: if move found */ {Q=-e-i;O=F;return l;} /* & not in check, signal */ v=m; /* (prevent fail-lows on */ } /* K-capt. replies) */ b[G]=k+38;b[F]=b[y]=0;b[x]=u;b[H]=t; /* undo move,G can be dummy */ if(Y&8){m=v;Y&=~8;goto A;} /* best=1st done,redo normal*/ if(v>m){m=v;X=x;Y=y|S&G;} /* update max, mark with S */ } /* if non castling */ t+=p<5; /* fake capt. for nonsliding*/ if(p<3&6*k+(y&V)==S /* pawn on 3rd/6th, or */ ||(u&~24)==36&j==7&& /* virgin K moving sideways,*/ G&M&&b[G=(x|7)-(r>>1&7)]&32 /* 1st, virgin R in corner G*/ &&!(b[G^1]|b[G^2]) /* 2 empty sqrs. next to R */ ){F=y;t--;} /* unfake capt., enable e.p.*/ }W(!t); /* if not capt. continue ray*/ }}}W((x=x+9&~M)-B); /* next sqr. of board, wrap */ C:if(m>I/4|m<-I/4)d=99; /* mate is indep. of depth */ m=m+I?m:-D(24-k,-I,I,0,J,Z,S,S,1)/2; /* best loses K: (stale)mate*/ if(!a->K|(a->X&M)!=M|a->D<=d) /* if new/better type/depth:*/ {a->K=Z;a->V=m;a->D=d;A->K=0; /* store in hash,dummy stays*/ a->X=X|8*(m>q)|S*(m<l);a->Y=Y; /* empty, type (limit/exact)*/ } /* encoded in X S,8 bits */ /*if(z==8)printf("%2d ply, %9d searched, %6d by (%2x,%2x)\n",d-1,N,m,X,Y&0x77);*/ } if(z&8){K=X;L=Y&~M;} return m; } ! main() { int j,k=8,*p,c[9]; ! 
F(i,0,8) {b[i]=(b[i+V]=o[i+24]+40)+8;b[i+16]=18;b[i+96]=9; F(j,0,8)b[16*j+i+8]=(i-4)*(i-4)+(j-3.5)*(j-3.5); } F(i,M,1035)T[i]=random()>>9; ! W(1) {F(i,0,121)printf(" %c",i&8&&(i+=7)?10:n[b[i]&15]); p=c;W((*p++=getchar())>10); N=0; if(*c-10){K=c[0]-16*c[1]+C;L=c[2]-16*c[3]+C;}else D(k,-I,I,Q,1,1,O,8,0); F(i,0,U)A[i].K=0; if(D(k,-I,I,Q,1,1,O,9,2)==I)k^=24; } /* initial board setup*/ /* center-pts table */ /*(in unused half b[])*/ /* play loop /* print board /* read input line */ */ */ /* parse entered move */ /* or think up one */ /* clear hash table */ /* check legality & do*/ } System Package/Subsystem/Namespace Constructing What is the ideal IDE? Class Description Writing Method/Function Signature Method/Function Body Writing Line/Statement Code Bubbles (Courtesy of Steve Reiss Brown University, USA) Code Canvas ! (Courtesy of Rob De Line Microsoft Research, USA) Gaucho by Fernando Olivero ! Flexitools 2010, ECOOP 2011, VL/HCC 2011 ! The programmer, like the poet, works only slightly removed from thought-stuff. He builds his castles in the air, from air, creating by exertion of the imagination. ! Frederick P. Brooks, Jr. ! ! The Mythical Man-Month ! Addison-Wesley, 1975 ThrustSSC We acquire more information through vision than through all the other senses combined. ! Colin Ware Part II ! ! Information Visualization Ascension Perception for Design ! Morgan Kaufmann, 2004 110,000 BHP 1,227 km/h UML is software g n o visualization wr (software) visualization myths a picture is worth g n o a thousand wr words visual programming g is n o software wrvisualization g n o wr static visualization g n o wr software visualization g n o wr dynamic visualization about software William Playfair Jacques Bertin 1759 - 1823 Semiology of Graphics,1967 software information of software visualization visualization visualization ~ story Stephen Few Show Me the Numbers ! Analytics Press, 2004 Edward Tufte The Visual Display of Quantitative Information ! 
Morgan Kaufmann, 2004 It was the twenty-eighth of November. An immense confused mass of men, horses, vehicles besieged the narrow entrances to the bridges and began to flow over them. Those in front, pushed by the weight of those behind, were crushed, trampled on, or forced into the ice-filled water of the Berezina. The confusion was so great that when Napoleon himself wished to cross, it was necessary to use force to clear a passage. Some there were who, determined to pass at all costs, cut a horrible way for themselves with their swords. Others opened an even crueler road for their carriages, driving them pitilessly through the helpless crowd, crushing men and women, in their odious greed sacrificing their companions in misery. Metaphors Software is intangible, having no physical shape or size. Thomas Ball, Stephen Eick ! Software Visualization in the Large ! In Computer, vol. 29, no. 4, pp. 33-43, IEEE Computer Society Press, 1996 Habitability is the characteristic of source code that enables programmers [...] to understand its construction and intentions and to change it comfortably and confidently. Software systems as cities is a versatile metaphor which enables the creation of efficient software visualizations to support reverse engineering. ! Richard Wettel ! Richard P. Gabriel ! ! ! ! Software Systems as Cities Patterns of Software ! PhD Thesis, University of Lugano, 2010 ! 
Oxford University Press, 1998 Software Cities The City Metaphor Program Comprehension System Language ScummVM C++ ArgoUML package ~ district nesting level ~ color class ~ building methods (NOM) ~ height attributes (NOA) ~ width, length lines (LOC) ~ color LOC 136'325 NOP 141 NOC kLOC 3'117 305 System Language ScummVM C++ CodeCity Smalltalk NOP NOC kLOC System 141 3'117 305 ScummVM C++ 34 173 29 CodeCity Language Smalltalk iTextSharp C# System Language NOC kLOC System 141 3'117 305 ScummVM C++ 34 173 29 CodeCity iTextSharp C# 22 485 58 iText Java 36 566 Jmol Java 50 558 ScummVM C++ CodeCity System Smalltalk Language NOP System 3'117 305 ScummVM C++ 34 173 29 CodeCity 22 485 58 305 34 173 29 iTextSharp C# 22 485 58 iText Java 36 566 59 Language Smalltalk System ScummVM C++ 34 173 29 CodeCity iTextSharp C# 22 485 58 59 iText Java 36 566 85 Jmol Java 50 jEdit Java 59 System 305 ScummVM C++ 34 173 29 CodeCity iTextSharp C# 22 485 58 iText Java 36 566 Jmol Java 50 jEdit Java ArgoUML GWT NOP NOP NOC kLOC 141 3'117 305 34 173 29 iTextSharp C# 22 485 58 59 iText Java 36 566 59 558 85 Jmol Java 50 558 85 966 98 jEdit Java 59 966 98 ArgoUML Java 88 1'817 144 Language Smalltalk NOC kLOC System 141 3'117 305 ScummVM C++ 34 173 29 CodeCity iTextSharp C# 22 485 58 59 iText Java 36 566 558 85 Jmol Java 50 59 966 98 jEdit Java Java 88 1'817 144 ArgoUML Java Java 302 4'372 212 GWT JBoss Smalltalk Language NOC kLOC 3'117 305 NOC kLOC NOP 141 NOC kLOC Smalltalk NOP Language 3'117 3'117 CodeCity NOC kLOC 141 141 141 ScummVM C++ NOP Language NOP NOP NOC kLOC 141 3'117 305 34 173 29 iTextSharp C# 22 485 58 59 iText Java 36 566 59 558 85 Jmol Java 50 558 85 59 966 98 jEdit Java 59 966 98 88 1'817 144 ArgoUML Java 88 1'817 144 Java 302 4'372 212 GWT Java 302 4'372 212 Java 1'507 7'881 435 JBoss Java 1'507 7'881 435 JDK 1.5 Java Smalltalk Smalltalk 664 12'888 1'085 iText System Language ScummVM C++ NOP NOC kLOC Java 59 kLOC iText Eclipse CodeCity C# Java Smalltalk 58 kLOC 2,871 kLOC 29 kLOC 
System Language 141 3'117 305 34 173 29 CodeCity iTextSharp C# 22 485 58 iText Java 36 566 59 Jmol Java 50 558 85 jEdit Java 59 966 98 ArgoUML Java 88 1'817 144 GWT Java 302 4'372 212 JBoss Java 1'507 7'881 435 JDK 1.5 Java 664 12'888 1'085 Eclipse Java 1'800 27'900 2'871 CodeCity Smalltalk NOP ScummVM C++ ArgoUML Java jEdit 144 kLOC NOC kLOC 141 3'117 305 34 173 29 iTextSharp C# 22 485 58 iText Java 36 566 59 Jmol Java 50 558 85 jEdit Java 59 966 98 ArgoUML Java 88 1'817 144 GWT Java 302 4'372 212 JBoss Java 1'507 7'881 435 JDK 1.5 Java 664 12'888 1'085 Eclipse Java 1'800 27'900 2'871 Smalltalk Java 98 kLOC Jmol Java JDK 85 kLOC Java ScummVM 1,085 kLOC C++ 305 kLOC JBoss Java 435 kLOC GWT Java 212 kLOC OpenSwing Java Software Evolution Analysis Ver. 0.12 18/08/2003 Ver. 0.10.1 9/10/2002 CPPParser JavaRecognizer Ver. 0.16 19/07/2004 Facade FacadeMDRImpl Facade 112 ModelFacade JavaTokenType NSUMLModelFacade 1'754 Coarse-grained Age Map Ver. 0.14 5/12/2003 ModelFacade 268 FacadeMDRImpl Facade NSUMLModelFacade Ver. 0.18.1 30/04/2005 language.java.generator AGE: 8 NOM: 0 NOA: 146 reveng.classfile AGE: 3 NOM: 200 NOA: 85 language.java.generator AGE: 8 NOM: 91 NOA: 24 STDCTokenType reveng.classfile AGE: 3 NOM: 0 NOA: 152 Ver. 0.20 9/02/2006 JavaRecognizer language.java.generator AGE: 8 NOM: 176 NOA: 79 Ver. 0.23.4 10/12/2006 Ver. 0.22 8/08/2006 Ver. 
0.24 12/02/2007 Fine-grained Age Map JavaTokenType 1 reveng.java AGE: 8 NOM: 0 NOA: 175 2 3 4 5 6 Coarse-grained Time Travel age: 1 2 3 4 5 6 7 8 updated often, rather unstable stable highly unstable ArgoUML years major releases 6 8 rarely updated 1 2 3 4 5 6 7 8 very old old young very old 0.10.1 2002 0.12 0.14 2003 0.16 2004 0.18.1 2005 0.20 0.22 2006 0.24 2007 7 8 ModelFacade model LOC: 1'280 NOM: 183 NOA: 60 0.10.1 2002 0.12 0.14 2003 0.16 2004 0.18.1 0.20 2005 0.22 0.24 2006 0.10.1 2007 2002 0.12 0.14 2003 0.16 2004 0.18.1 0.20 2005 0.22 0.24 2006 0.10.1 2007 2002 0.12 0.14 2003 0.16 0.18.1 2004 2005 NSUMLModelFacade model LOC: 3'275 NOM: 426 NOA: 108 0.10.1 2002 0.12 0.14 2003 0.16 2004 0.18.1 0.20 2005 0.22 0.24 2006 0.10.1 2007 2002 0.12 0.14 2003 0.16 2004 Facade FacadeMDRImpl 0.18.1 0.20 2005 0.22 Facade model LOC: NOM: NOA: 2002 0.12 0.14 2003 0.16 2004 0.18.1 2005 0.20 0.22 2006 0.24 0.24 2006 2007 2002 0.12 0.14 2003 0.16 2004 0.18.1 2005 0.20 Facade 0.22 2006 2006 2007 Facade model LOC: 2'709 NOM: 329 NOA: 2 0.10.1 2007 FacadeMDRImpl 0.10.1 0.24 FacadeMDRImpl 0 306 1 2002 0.12 0.14 2003 0.16 0.18.1 2004 0.20 2005 0.22 0.24 2006 2007 Fine-grained Time Travel JHotDraw years revisions 0.10.1 0.22 NSUMLModelFacade model LOC: 2'102 NOM: 316 NOA: 2 ModelFacade 0.20 5 8 0.24 2007 2000 2001 2002 2003 2004 3 1 2000 2001 2002 2003 2004 2000 2001 2002 2003 2 2 1 1 2004 2000 2001 2002 2003 2004 6 2000 2001 2002 2003 5 5 4 4 4 3 3 3 2 2 2 1 1 1 2004 2000 2001 2002 2003 2004 2000 8 2000 2001 2002 2003 2004 7 7 6 6 5 5 4 4 3 3 2 2 1 1 2000 2001 2002 2003 2004 2001 2002 2003 2004 Problem Detection & Analysis Metrics? Don’t trust them ! Michele Lanza & Radu Marinescu ! ! Object-Oriented Metrics in Practice Using Software Metrics to Characterize, Evaluate, and Improve the Design of Object-Oriented Systems ! 
ArgoUML classes Springer, 2006 brain class 8 god class 30 god + brain 6 data class 17 unaffected 1'715 JDK 1.5 god classes Facade NOM 140/337 AggregationKind NOM 3/3 VisibilityKind NOM 4/4 PseudostateKind NOM 6/6 Model NOM 28/44 Shotgun surgery JMol Feature Envy useful? CodeCity codecity.inf.usi.ch Interlude Chains of Academia “I am not convinced I would learn anything from the visualizations” In the Name of Science SE C D I TE C JE RE Design Execution Knock me down, I’ll just come back running Results Design State of the Art Survey Experiment Design Desiderata Experiment Design Desiderata 1 Avoid comparing using a technique against not using it. 1 Avoid comparing using a technique against not using it. 2 Involve participants from the industry. 2 Involve participants from the industry. 3 Provide a not-so-short tutorial of the experimental tool to the participants. 3 Provide a not-so-short tutorial of the experimental tool to the participants. 4 Avoid, whenever possible, giving the tutorial right before the experiment. 4 Avoid, whenever possible, giving the tutorial right before the experiment. 5 Use the tutorial to cover both the research behind the approach and the tool. 5 Use the tutorial to cover both the research behind the approach and the tool. 6 Find a set of relevant tasks. 6 Find a set of relevant tasks. 7 Choose real object systems that are relevant for the tasks. 7 Choose real object systems that are relevant for the tasks. 8 Include more than one object system in the design. 8 Include more than one object system in the design. 9 Provide the same data to all participants. Finding a Baseline 9 Provide the same data to all participants. 10 Limit the amount of time allowed for solving each task. 10 Limit the amount of time allowed for solving each task. 11 Provide all the details needed to make the experiment replicable. 11 Provide all the details needed to make the experiment replicable. 12 Report results on individual tasks. 
12 Report results on individual tasks. 13 Include tasks whose expected result is not to the advantage of the tool being evaluated. 13 Include tasks whose expected result is not to the advantage of the tool being evaluated. 14 Take into account the possible wide range of experience level of the participants. 14 Take into account the possible wide range of experience level of the participants. 1. program comprehension 2. design quality assessment 3. system evolution analysis Research Questions 1 Research Questions 1 2 Does the use of CodeCity increase the correctness of the solutions to program comprehension tasks, compared to non-visual exploration tools, regardless of the object system size? Tasks Tasks Does the use of CodeCity increase the correctness of the solutions to program comprehension tasks, compared to non-visual exploration tools, regardless of the object system size? Does the use of CodeCity reduce the time needed to solve program comprehension tasks, compared to non-visual exploration tools, regardless of the object system size? A1 Identify the convention used in the system to organize unit tests. A2.1& A2.2 What is the spread of term T in the name of the classes, their attributes and methods? A3 Evaluate the change impact of class C, in terms of intensity and dispersion. A4.1 Find the three classes with the highest number of methods. A4.2 Find the three classes with the highest average number of lines of code per method. B1.1 Identify the package with the highest percentage of god classes. B1.2 Identify the god class with the largest number of methods. B2.1 Identify the dominant (affecting the highest number of classes) class-level design problem. B2.2 Write an overview of the class-level design problems in the system. Tasks Program Comprehension 6 Quantitative Identify the convention used in the system to organize unit tests. A1 Identify the convention used in the system to organize unit tests. 
A2.1& A2.2 What is the spread of term T in the name of the classes, their attributes and methods? A2.1& A2.2 What is the spread of term T in the name of the classes, their attributes and methods? A3 Evaluate the change impact of class C, in terms of intensity and dispersion. A3 Evaluate the change impact of class C, in terms of intensity and dispersion. A4.1 Find the three classes with the highest number of methods. A4.1 Find the three classes with the highest number of methods. A4.2 Find the three classes with the highest average number of lines of code per method. A4.2 Find the three classes with the highest average number of lines of code per method. B1.1 Identify the package with the highest percentage of god classes. B1.1 Identify the package with the highest percentage of god classes. B1.2 Identify the god class with the largest number of methods. B1.2 Identify the god class with the largest number of methods. B2.1 Identify the dominant (affecting the highest number of classes) class-level design problem. B2.1 Identify the dominant (affecting the highest number of classes) class-level design problem. B2.2 Write an overview of the class-level design problems in the system. B2.2 Write an overview of the class-level design problems in the system. Design Quality Assessment 4 Experiment Runs Qualitative Testing the Waters Execution 9 A1 1 Experiment Timeline 2009 November day 1 day 2 day 3 day 4 18 24 25 2 9 3 1 1 1 November 18 24 25 1 1 1 Lugano 1 2 9 3 1 1 1 21 January 28 5 8 14 February 28 ... 
April 18 22 24 25 1 1 3 14 1 1 1 remote sessions Bologna experiment session (2 hours) December time Lugano training session (1 hour) 2009 2010 December e1 e2 c1 c2 e3 Antwerp c4 2 1 1 1 1 6 1 5 6 remote session Bern 4 6 1 A.5 Data On Replicability 45 Background Age 40 Code 30 a Introduction Country of origin 18 12 T1 Participant: T1 experiment is to compare practitioners Participant: Introduction analyzing medium to large-scaletool efficiency in supporting software software systems. You will use The aim of thisCodeCity to analyze Azureus, experiment is to a BitTorrent client compare tool written in Java. efficiency in supporting medium to large-scale given maximum software Introduction 100 minutes for software systems. solving 10 tasks YouYou (10 minutes per willare useasked: CodeCity analyze task). Azureus, The aim of this experiment to is to compare BitTorrentinclient tool aefficiency supporting software • not to consult written in Java. any other practitioners analyzing medium You• are to large-scale participantsoftware systems. given maximum Introduction to perform the experiment; minutes for during the tasks100 in the specifiedsolving • to write down order; 10 tasks (10 minutes per task). the You will use CodeCity current toisanalyze Azureus, areexperiment a efficiency BitTorrent time client asked: written insoftware The aim ofYou thisafter to compare tool in supporting Java. each time before completing all • not tasks; software systems.starting to read a task consult practitioners medium to the large-scale •analyzing totoannounce any other and once participant during the You are experimente maximum 100 minutes • togiven forr that the solving perform tasks experiment; (10 minutes per task). the tasks to reset your you10 in the specified are moving 10-minutesorder; on • to writetodown another task, per-task You will use CodeCity to analyze Azureus, a BitTorrent client written into Java. 
• not the current allocated in order time each time timer; You areafter asked: return to earlier tasks because it before • forcompleting to read a task and each task,all affectsstarting the timing; to the fill once • not any100 intasks; other participant the • totoconsult during10 announce thetasks experiment; You are given maximum minutes forrequired solving (10 minutes per task). check the information. experimenter the most that the case of multiple you are movingInon • to perform tasks in appropriate the specifiedanswer to requested. resetthe order; your 10-minutes-per to another task, inchoices and provide additional -task order allocated • toasked: theto current time each not down •write time before timer; to return starting to read a information, You are task and once earlier tasks because if after completing all the affects tasks; during the itexperiment; • for anytask, other Theeach • not to consult toparticipant experiment in the required information. the timing; isfillconcluded to announce the check youaare moving In onthe with to another the the tasks inexperimenter the specifiedthat order; task, in order most • to •perform case of multiple short appropriate debriefing answer choices to reset questionnaire andtimer; your provide to allocated additional down the10-minutes-per-task current time each time before starting read a task and.once • to write requested. information, if to returnalltothe • not earlier tasks because it affects the timing; after completing tasks; forThe each task, to fill in the required experiment case of multiple the experimenter that youinformation. are moving In onthe to another task, in order Thank • to •announce choices is concluded you with and participating short check the10-minutes-per-task mostfor appropriate debriefing answer in provide additional thisa experiment! information, if to reset your allocated timer; questionnaire. requested. 
to because it affects the timing; • not to return earlier tasks Richard the case of multiple choices Wettel, In • for each task, to fill in the required information. Michele Lanza, Romain TheThank experiment concluded withand a short debriefing you appropriate questionnaire. forisparticipating check the most answer provide additional information, ifRobbes in this experiment! requested. Richard Wettel, Michele Lanza, Romain Robbes The experiment is concluded with a short debriefing questionnaire. Thank you for participating in this experiment! Participant: practitioners You are analyzing Richard Wettel, Michele Lanza, Romain Robbes Thank you for participating in this experiment! Richard Wettel, Michele Lanza, Romain Robbes 174 A.4.1 T1: Azureus, analyzed A.4 Task Solution with CodeCity Oracles A.2. The assig A.5 Data A1 A.4 Task Solution Oracles A.4.1 T1: Azureus, analyzed with CodeCity Either 174 A.4 Task Solution Oracles A1 There are no unit tests in the system [1pt], or Either A.4.1 T1: Azureus, Centralized A.4 Task Solution Oracles analyzed with CodeCity There are no unit in a single package hierarchy tests in the system [1pt]. Since whose root is there is only one [1pt], A1 or in org.gudy.azure test class (i.e., answer, the answer us2.ui.console. TestUserManage Centralized T1: Azureus, analyzed with CodeCity is multiuser r), if they don’t in a single wrong. Either packagecompletely give the full correct hierarchy whose [1pt]. Since there root is in org.gudy.azureus2. is only There are no unitA2.1 one test tests in the system [1pt], class (i.e., TestUserManager), ui.console.multiuse answer, the answer is completely r if they don’t give or wrong. the full correct Either Centralized Dispersed in a single [0pts package hierarchy whose root is in org.gudy.azureus2.ui.console.multiuser otherwise] There are no unit tests in in the following system [1pt], A2.1 [1pt]. Since there isthe only one test (max. 
class (i.e., TestUserManager), if they don’t give the full correct 5) packages or [0.2pts for each]: answer, the answer is completely wrong. Dispersed [0pts • com.aelitis.azur otherwise] Centralized in a single package hierarchy whose root is in org.gudy.azureus2.ui.console.multiuser eus.core in the following [1pt]. Since there is only one test(max. class (i.e., TestUserManager), if they don’t give the full correct 5) packages [0.2pts for • com.aelitis.azur A2.1 answer, the answer•is completely wrong. eus.core.conten each]: com.aelitis.azureus. t core Dispersed [0pts otherwise] • com.aelitis.azur eus.core.downlo • com.aelitis.azureus. A2.1 in the following (max. 5) packages [0.2pts for each]: ad core.content • com.aelitis.azur eus.core.impl • com.aelitis.azureus. Dispersed•[0pts otherwise] com.aelitis.azureus.core core.download com.aelitis.azur in the following (max. 5) •packages [0.2pts eus.core.lws for each]: • com.aelitis.azureus. • com.aelitis.azureus.core.content core.impl • com.aelitis.azur • com.aelitis.azureus.core eus.core.peerma • com.aelitis.azureus. • com.aelitis.azureus.core.download nager.peerdb core.lws • com.aelitis.azur • com.aelitis.azureus.core.content eus.core.stats • com.aelitis.azureus. • com.aelitis.azureus.core.impl core.peermanager.p • com.aelitis.azur eerdb • com.aelitis.azureus.core.download eus.core.torrent • com.aelitis.azureus. • com.aelitis.azureus.core.lws core.stats • com.aelitis.azur • com.aelitis.azureus.core.impl eus.plugins.net. • com.aelitis.azureus. buddy • com.aelitis.azureus.core.peermanager.peerdb core.torrent • com.aelitis.azur • com.aelitis.azureus.core.lws eus.plugins.net. • com.aelitis.azureus. • com.aelitis.azureus.core.stats plugins.net.buddy buddy.swt • com.aelitis.azur • com.aelitis.azureus.core.peermanager.peerdb eus.plugins.net. • com.aelitis.azureus. buddy.tracker • com.aelitis.azureus.core.torrent plugins.net.buddy.sw • com.aelitis.azur t • com.aelitis.azureus.core.stats • com.aelitis.azureus. 
eus.plugins.removerules • com.aelitis.azureus.plugins.net.buddy plugins.net.buddy.tr • com.aelitis.azur acker • com.aelitis.azureus.core.torrent • com.aelitis.azureus. eus.plugins.sharing.hoster • com.aelitis.azureus.plugins.net.buddy.swt plugins.removerules • com.aelitis.azur • com.aelitis.azureus.plugins.net.buddy • com.aelitis.azureus. eus.plugins.startstoprules.defau • com.aelitis.azureus.plugins.net.buddy.tracker plugins.sharing.host ltplugin • com.aelitis.azur er • com.aelitis.azureus.plugins.net.buddy.swt • com.aelitis.azureus. eus.plugins.tracker.dht • com.aelitis.azureus.plugins.removerules plugins.startstoprule • com.aelitis.azur s.defaultplugin • com.aelitis.azureus.plugins.net.buddy.tracker • com.aelitis.azureus. eus.plugins.tracker.local • com.aelitis.azureus.plugins.sharing.hoster plugins.tracker.dht • com.aelitis.azur • com.aelitis.azureus.plugins.removerules • com.aelitis.azureus. eus.plugins.tracker.peerauth • com.aelitis.azureus.plugins.startstoprules.defaultplugin plugins.tracker.local • com.aelitis.azur • com.aelitis.azureus.plugins.sharing.hoster • com.aelitis.azureus. eus.ui.swt.content.columns • com.aelitis.azureus.plugins.tracker.dht plugins.tracker.peer • com.aelitis.azur auth • com.aelitis.azureus.plugins.startstoprules.defaultplugin • com.aelitis.azureus. eus.ui.swt.shells.main • com.aelitis.azureus.plugins.tracker.local ui.swt.content.colum • com.aelitis.azur ns • com.aelitis.azureus.plugins.tracker.dht • com.aelitis.azureus. eus.util • com.aelitis.azureus.plugins.tracker.peerauth ui.swt.shells.main • com.aelitis.azureus.plugins.tracker.local • com.aelitis.azureus. 
• com.aelitis.azureus.ui.swt.content.columns util • com.aelitis.azureus.plugins.tracker.peerauth • com.aelitis.azureus.ui.swt.shells.main • com.aelitis.azureus.ui.swt.content.columns • com.aelitis.azureus.util • com.aelitis.azureus.ui.swt.shells.main 174 A.4.1 A1 • com.aelitis.azureus.util 194 ta CodeCity Experiment CodeCityTheExperiment aim of this Explore Italy Belgium Switzerand Argentina Canada Germany Romania T1 Da industryadvanced ent 5 academyadvanced 174 A. academybeginner EXAMINE VARIABLES=Age BY Syssize BY Tool /PLOT=BOXPLOT /STATISTICS=NONE /NOTOTAL. 7 T1 Compl. Time Total (excl. A4.2) Per Task B2.1 B1.2 A.5 48.55 Data Completion Time B1.1 A4.2 52.97 A4.1 1.02 A3 1.90 34.67 9.85 A2.2 44.67 4.42 A2.1 Code 2.33 A1 7.28 2.92 10.00 37.97 5.25 5.38 47.33 10.00 4.32 1.83 5.33 8.80 2.95 8.58 45.50 IA01 2.55 1.75 55.50 9.37 6.08 1.08 5.62 2.42 1.33 9.43 32.83 IA02 10.00 2.42 42.83 10.00 6.92 2.58 7.92 6.25 2.25 6.08 43.25 IA03 6.92 4.00 53.25 10.00 6.00 1.67 6.25 9.08 2.58 4.75 27.08 IA04 9.42 3.92 37.08 10.00 4.83 2.17 8.08 1.33 1.58 4.25 36.00 AB01 3.75 1.67 46.00 10.00 5.58 1.42 3.50 10.00 1.33 4.75 45.92 AB02 5.17 1.33 55.67 10.00 3.67 1.33 3.17 6.33 3.25 8.75 44.67 IA05 8.33 2.08 54.67 9.75 4.42 1.50 5.00 9.67 1.33 10.00 AA01 36.25 7.33 Correctness Per Task Correctness 2.50 10.00 46.25 7.17 Total2.33 8.33 10.00 A4.24.17B1.1 AA02 24.67 A1 A2.1 A2.2 A3 B2.1 1.42 (excl. 
A4.2) 4.42 A4.1 34.67 10.00B1.2 2.42 5.92 1.17 4.58 10.00 IA06 1.00 39.50 6.25 1.006.33 0.00 0.00 1.00 0.0010.001.00 3.33 1.00 4.006.00 1.42 6.00 47.33 5.67 1.00 3.75 7.25 2.50 37.58 IA07 0.80 3.67 2.95 1.004.55 0.00 1.00 0.50 1.00 6.30 1.25 6.30 46.08 2.25 1.00 5.250.00 7.831.00 2.67 1.50 10.00 37.83 IA08 7.17 0.00 0.00 1.00 0.00 5.000.00 8.50 1.00 0.00 3.00 8.08 3.00 45.58 3.67 0.003.00 1.00 10.00 1.75 5.00 35.32 IA09 1.00 42.23 0.00 0.00 3.830.00 7.75 1.00 3.08 1.00 5.00 1.75 5.00 2.00 1.009.33 1.00 6.33 0.00 2.83 6.83 30.33 AB03 40.33 0.00 1.003.25 1.00 0.00 4.00 0.00 6.92 1.00 7.42 1.00 5.80 1.25 5.80 3.67 0.807.33 1.00 2.00 7.17 24.75 IA10 34.50 0.00 1.003.83 1.002.40 0.705.921.00 0.00 5.17 0.00 10.00 0.00 5.75 1.00 0.50 4.70 3.83 4.70 5.58 30.50 IA11 2.00 36.83 0.00 1.003.75 1.002.67 0.20 4.171.00 3.67 0.00 5.58 0.00 9.75 1.00 1.00 1.25 5.20 2.58 5.20 39.00 IA12 4.08 49.00 0.00 0.802.671.003.920.40 2.581.0010.00 0.00 3.17 0.00 6.33 1.00 0.00 1.83 4.20 1.67 4.20 35.83 AB04 9.58 3.75 43.67 0.00 1.00 2.501.003.170.00 1.00 5.58 0.00 5.83 1.00 10.00 1.00 1.00 1.25 6.00 2.83 6.00 26.58 AA03 6.00 4.33 36.58 4.670.30 0.00 1.00 6.33 0.00 3.50 0.00 7.83 1.00 1.00 1.67 4.30 2.08 4.30 AB051.00 5.500.00 43.92 4.83 5.17 49.75 1.00AA041.00 7.081.00 3.670.17 1.00 4.75 0.00 3.25 0.00 10.00 1.00 1.00 1.83 6.17 4.17 6.17 46.27 5.33 2.33 56.27 1.00IA13 0.50 3.00 1.00 4.67 0.83 1.00 10.00 0.00 4.50 1.00 5.83 1.00 10.00 1.00 7.33 2.00 7.33 3.85 55.92 2.42 65.92 10.00 9.00 2.67 10.00 6.67 1.00IA141.00 1.00 1.00 2.05 1.00 6.95 0.00 1.00 1.00 10.00 1.00 8.00 8.00 3.67 50.33 54.75 10.00 3.17 1.00AB061.00 5.67 1.00 5.75 1.00 4.67 1.00 10.00 0.00 6.83 1.00 1.00 10.00 1.00 8.00 6.67 8.00 30.83 37.58 4.42 9.33 1.58 6.17 1.58 1.00AB07 1.00 8.75 1.00 0.17 3.75 1.00 8.42 0.00 5.25 0.00 1.00 1.00 6.17 22.83 5.33 6.75 6.08 1.835.0028.00 3.25 3.33 1.00 AA05 1.00 7.00 1.00 0.00 3.67 1.00 6.58 0.16 0.00 0.00 1.00 5.16 52.67 2.75 62.67 5.17 2.00 7.83 2.92 1.75 50.25 1.00 AA06 1.00 6.83 1.00 1.00 
2.50 1.00 2.17 0.00 1.00 1.00 1.00 8.0056.75 5.08 8.00 10.00 3.67 4.50 4.33 2.00 39.02 1.00 AA07 1.00 3.67 1.00 0.67 5.08 1.00 10.00 0.00 0.00 1.00 6.6743.32 8.50 6.67 6.50 1.00 8.83 4.00 2.93 2.10 9.50 46.55 1.00 IA15 1.00 9.75 0.80 7.42 0.33 4.33 1.00 0.16 0.00 1.00 6.13 50.53 9.92 6.29 4.30 1.00 3.12 4.03 10.00 4.13 9.20 33.08 1.00 IA16 1.00 1.00 3.90 1.00 1.00 0.00 0.00 1.00 7.00 38.17 9.77 7.00 4.38 3.98 1.00 5.58 4.72 2.55 1.92 9.82 44.42 6.58 6.50 1.00 IA01 1.00 1.00 5.13 0.50 1.00 0.00 0.00 1.00 6.50 50.33 4.58 5.08 1.00 1.92 3.83 5.28 3.58 3.50 33.83 9.42 6.83 1.00 IA18 1.00 1.00 3.83 0.83 1.00 1.00 0.00 0.00 5.83 39.83 4.50 5.92 1.00 1.50 3.50 3.33 1.92 8.83 0.00 38.00 2.00 10.00 0.00 IA19 0.80 0.33 0.30 1.00 0.67 1.00 1.00 5.10 4.43 41.33 6.00 1.08 2.33 3.42 6.08 1.17 10.00 31.92 4.83 4.00 4.33 1.00 AB08 1.00 1.00 0.00 0.00 0.00 0.00 4.00 37.08 3.33 0.00 4.83 2.92 8.00 1.00 5.83 1.67 7.33 34.25 3.67 2.27 5.08 0.00 5.17 1.00 0.00 AB09 0.60 0.67 0.00 0.00 0.00 2.27 37.58 6.08 3.00 3.50 0.00 3.17 1.42 3.83 53.08 6.08 1.00 3.33 1.004.42 7.00 1.00 AA10 1.00 1.00 0.00 1.00 0.00 1.00 6.00 58.75 4.08 5.42 5.25 6.17 1.42 4.75 40.08 AA11 3.92 1.00 5.67 1.006.33 4.00 4.75 1.00 0.00 1.00 0.00 0.00 3.00 43.33 4.50 5.92 0.00 6.75 0.00 2.42 10.00 32.67 AA12 9.17 10.00 1.00 7.00 1.00 1.00 1.00 0.00 0.00 5.001.42 4.00 36.08 1.33 2.83 1.003.25 0.00 7.00 0.00 10.00 AA13 2.42 3.33 1.00 1.50 1.00 1.00 0.80 0.00 4.80 3.80 2.33 1.003.42 0.00 6.33 0.00 8.00 0.00 AA14 4.75 5.58 1.00 0.00 0.00 0.67 0.67 0.00 1.00 1.00 4.34 3.67 6.83 0.00 IA20 time, in minutes 1.00 1.00 1.00 0.50 1.00 1.00 subjects’ 0.00 1.00 1.00 7.50 6.50 task completion The A.4. 
1.00 1.00 1.00 1.00 0.50Table 1.00 0.00 1.00 1.00 7.50 6.50 0.00 1.00 0.60 0.00 1.00 1.00 0.00 1.00 0.00 4.60 3.60 0.00 1.00 0.40 0.38 1.00 1.00 1.00 1.00 1.00 6.78 5.78 1.00 1.00 0.60 0.50 1.00 1.00 0.00 1.00 1.00 7.10 6.10 1.00 1.00 0.80 0.38 1.00 1.00 0.00 1.00 1.00 7.18 6.18 1.00 1.00 0.80 0.00 1.00 1.00 0.00 1.00 1.00 6.80 5.80 0.00 0.50 0.00 0.00 1.00 1.00 0.00 1.00 1.00 4.50 3.50 0.00 1.00 0.00 0.25 1.00 1.00 0.00 1.00 1.00 5.25 4.25 1.00 1.00 0.60 0.00 1.00 1.00 0.90 1.00 1.00 7.50 6.50 0.00 1.00 1.00 0.38 1.00 1.00 0.00 1.00 1.00 6.38 5.38 196 ent CodeCity Experim Participant: Table 195 CodeCity Experim 20 Block developer software engineer system analyst software architect project manager IT lead CTO consultant professor Ph.D. student Master student Dat 20 A.5 41 Subjects 21 25 industry academia IA01 A1 IA02 diffic IA03 ult IA04 diffic AB01 A2.1 simp ult AB02 inter trivia le IA05 medi trivia l AA01 ate inter A2.2 simp l AA02 simp medi inter inter le IA06 ate trivia le medi trivia medi IA07 trivia l ate inter ate inter l IA08 simp l simp medi diffic medi IA09 trivia le A3 simp 35 ate trivia le Cod trivia ult ate AB03 le trivia l trivia l Diffi diffic l IA10 e diffic IA0 simp l A4.1 culty simp l simp ult IA11 simp ult IA0 1 inter le simp Age simp le Leve simp le IA12 IA0 2 simp le trivia medi le trivia le l Per IA0 3 simp le AB04 30 trivia le ate simp l inter simp l A4.2 Task AB0 4 34 simp le AA03 Job simp l simp le inter medi AB0 1 inter le 42 impo trivia le AB05 inter le Pos simp le Dev simp medi ate IA0 2 37 simp medi ssible trivia l AA04 simp medi Dev elop itio simp le ate simp le AA0 5 21 ate simp le diffic simp l IA13 ate simp le B1.1 CTO elop simp le AA0 1 n inter le 21 simp le er impo ult inter le IA14 simp le Dev , diffic IA0 2 trivia le 29 trivia medi simp le inter ssible trivia medi Ma elop Dev er AB06 inter le IA0 6 ult 26 trivia l ate simp l simp le diffic medi ate Ma ster elop inter l AB07 IA0 7 inter medi 26 simp simp l simp le B1.2 simp le 
Con ster Stuer impo ult ate IA0 8 simp medi AA05 35 simp medi ate diffic le inter le simp le trivia Ph. sult Stu den er trivia le impo ssible AB0 9 ate inter le AA06 27 ate simp le diffic ult simp medi inter le Ph. D. l trivia l IA1 3 impo ssible 25 simp medi AA07 diffic le impo ult ate inter le Hea D. Stu ant den t inter medi simp IA1 0 simp l inter ssible 32 B2.1 ate inter le IA15 inter ult diffic ssible simp medi Sof d Stu den , Ph. t simp medi ate IA1 1 simp le diffic le 28 diffic medi trivia medi trivia IA16 simp medi impo ult ate Sof twa of den t D. simp le ate AB0 2 diffic le simp le 39 simp ult impo ult ate ate diffic l l ate diffic le Stu IA01 diffic ssible Dev twa re IT AA0 4 simp le t inter ult 38 simp le inter le diffic ssible OO inter ult simp denIA18 inter ult Stu elop re Eng inter ult AB0 3 B2.2 simp le 34 inter medi diffic le simp medi kno P le inter ult trivia le Pro den inter medi IA19 diffic medi Eng ine t AA0 5 simp medi 22 inter inter inter medi ate trivia ult ate simp le impo medi Con ject t me wle medi simp l inter medi ate advinter IA1 4 AB08 simp ult ate ate 22 diffic le medi nt ine er trivia medi ate simp l Time simp le Sen sult Ma anc dge diffic ssible ate trivia le IA1 3 simp medi ate exp inter AB09 22 inter le inter ult Lea er ate impo ate inter l simp le ert edmediablate Ma ior simp le diffic ult AB0 4 Pres trivia l 29 ateadvadv inter inter le AA10 inter medi inter medi der medi ate diffic ssible inter medi Ma ster Javant nag anc simp le inter le AB0 6 fair sure diffic ult 32 e simp l trivia medi adv ancsimp AA11 impo medi ate , Res inter medi ate inter ult inter medi ate Jav medi Ma ster Stu a , Sys er ed medi ate inter le AA0 7 inter fair amou 31 impo ult trivia le ate simp l inter ssible ate exp ancsimp diffic medi ate simp medi ate Ph. ster Stu den Arc tem adv medi a ed le ear AA12 inter medi ate AA0 5 simp medi ate 23 inter fair amou nt impo ssible Exp simp l diffic le AA13 exp ert inter diffic medi trivia ult ate Con D. 
ate hite adv simp le le ed ate che AA0 6 medi anc inter ate 23 simp le Stu medi ate Ma den t too amou nt inter impo ssible simp ult exp ert simp medi inter le AA14r erietrivia ult ate kno Sof sult Stu den trivia l trivia le IA1 7 anc ct nag 30 trivia medi ate inter le mediedate very much nt t diffic ssible simp le exp ert inter le simp medi IA20 advinter Ma twa ant den diffic diffic l IA1 5 nce l wle ed ate inter l ate 26 ate inter l simp medi er/ fair little inter ult ate advinter Ma ster re trivia le kno ert simp medi trivia le impo ult anc medi t t IA0 6 diffic ult 30 dgeate diffic medi trivia medi Lev ate Ana simp le not amou simp Ph. ster Stu Arc diffic medi inter l kno wle simp le simp l anc medi IA1 1 ssible ed abl diffic Ecl 40 inter ult ate begadvsimp inter ult ate ate trivia l el trivia le adv wle dge lyst ate le very so muchnt Ph. D. diffic ult ate anc ed medi IA1 7 inter le 39 Stu den hite ult ips inter medi e diffickno le inter medi simp l kno ate trivia l exp inter Ph. D. Stu anc dge inter le not little inn ed AB0 9 inter ult simp medi abl medi 30 ct simpknoult wle e inter medi ate diffic medi ate exp wle simp le adv ert Pro D. 
Stu den den t inter l ed trivia fair so much AB0 8 interermedi inter medi abl e 27 ate simp le interbegle wle dge trivia le medi ate t impo kno ert dge simp ateexp anc inter ult ate l Stu Sof den medi ject inter AA1 9 kno t medi ate simp e inn dge abl 39 not amou simp medi ate simp le medi impo exp wle simp ssible ate le diffic medi kno ertintered le Dev twa Ma den t simp l AA1 0 adv wle inter medi ate abl 21 ate very so muchnt ate simp le trivia le e le ate abl trivia adv ert dge inter anc ssible er diffic le adv wle Sof elop re nag t diffic ult ate AA1 1 adv e Rev 23 inter medi ate medi not little simp le e trivia l adv anc interancmedi dge beg anc trivia Con twa Arc simp ult AA1 2 beg diffic ult abl 24 dge l ed abl trivia l beg .En inter medi ate ate simp very so much adv anc ed simp le diffic l kno inn ed Ma sult re er simpinnmedi hite er AA1 3 kno inter le e diffic ult 23 ed ate e simp l kno inn g. simp le abl inter medi ate le adv anc ed not little inter ult adv wle er Ph. ster ant Eng IA2 4 knodiffic inter medi 52 wle le er ate ct simp ult e trivia lekno wle er simp le inter medi ate exp anc ed fair so much beg anc dge Ph. D. adv impo inter medi 0 28 ine wle ult inter medi ate Stu dge simp le wle dge simp beg l adv ert ed simp medi ate exp inn ed abl Ph. D. Stu den, Pro er none amou trivia le beg inter inter medi ate 24 anc ssible beg dge abl inn dge abl impo medi ate le impo Tab le simp adv ate le trivia Stu adv Pro anc D. ject den nt very ert er simp l 36 inter medi ate medi abl e simp beg e le kno inn ed inn er OO diffic ssible ate diffic ssible adv anc ed abl e le adv anc Ph. fess Stu den t t inter l non not little kno leinn er trivia Ma inter medi ate ate e A.5wle er 7–1 P simp ult adv anc ed e inter ult adv anc ed Ma D. or den t trivia simp medi beg e not so much beg lewle er nag simp l dge inter medi ate 7–1 0 diffic le exp anc ed adv anc ed Dev ster Stu Nu inter medi trivia kno inn . 
The l ate t trivia le kno inn fair so much er, inter le >1 0 dge simp medi ate abl adv ert ed kno anc ed interJavult mb diffic medi ate beg wle er elop Stu den l er inter l none amou Arc 7–1 a esub adv wle ate exp anc impo le abl simp medi7–1 0 kno wle ed er simp medi simp medi adv inn dge inter ult ate kno anc dge er den t nt hite fair ject adv ert ed etrivia le 4–6ate0 inter ssible adv wle dge adv anc er simp4–6le 0 ateEcl oftrivia le diffic medi beg wle ed abl trivia ate t abl fair amou kno anc Yea ct l 1–3 1–3le adv anc dge abl simp medi adv anc ed simp adv inns’ dge ips inter ult ate 7–1 7–1 per e amou inter l kno wle ed e nt too adv anc ed abl e rs l ate kno anc ed simp le beg anc er le simp e medi impo medi 4–6 >1 0 Ta exp wle dge simp ablceiv 4–6 0 kno anc ed fair much nt kno wle ed inter le kno inn ed e diffic le diffic ssible ate 1–3inter le adv ert dge abl ble e ed 4–6 0 ate 1–3 adv wle ed kno wle dge very amou adv wle er simp medi 7–1trivia ult 1–3simp mediRev >1tim adv anc diffic ult exp anc dge kno wle dge none little nt .En abl e adv anc dge ate simp le A. 7–1 0 e 4–6 >1 0 l adv anc ed diffic ult adv ert ed abl ate g. inter le adv wle dge ablable e kno anc ed abl fair pre 1. 
4–6 simp le adv anc ed 1–3 1–3 0 adv anc 4–6 0 impo ult adv anc dge beg wle ed fair amou e 7–1 1–3 trivia le >1 ssu inter medi exp anc ed e 4–6 Th adv anc ed adv anc ed ablable e inter ssible beg inn dge 7–1 0 not amou nt re 4–6 trivia l >1 adv ert ed inter medi ate exp anc ed 4–6 0 e adv anc ed diffic medi beg inn er and l 1–3 e fair so muchnt exp anc >1 abl 1–3 0 simp medi ate 4–6 0 adv ert ed su kno anc ed exp inn er impo ult ate 7–1 7–1 0 1–3 task adv ert ed e fair amou adv anc ate bje <1 diffic le adv wle ed beg ert er impo ssible 4–6 >1 0 anc 1–3 0 not amou nt adv anc ed <1 kno anc dge trivia ult beg inn diffi cts 7–1 diffic ssible 1–3 4–6 adv anc ed 4–6 0 ed fair so muchnt kno wle ed abl l beg inn er 7–1 0 7–1 1–3 inter ult ’ pe adv anc ed <1 cul very amou kno wle dge kno inn er e ty >1 0 impo medi 1–3 <1 adv anc ed 4–6 0 kno wle dge not little nt exp wle er 7–1 rso 1–3 1–3 0 diffic ssible ate exp anc ed 4–6 adv wle dge ablable 7–1 kno ert dge not so much 7–1 4–6 0 na 4–6 exp ert ed ult adv anc dge adv wle not so much 7–1 0 1–3 0 abl 1–3 l inf <1 ert adv anc ed ablable e kno anc dge >1 not so much e 4–6 4–6 4–6 0 adv anc ed beg wle ed abl 1–3 0 e not so much or 1–3 4–6 4–6 kno anc ed beg inn dge 7–1 7–1 so 4–6 ma e 4–6 kno wle ed much adv inn er 7–1 7–1 0 1–3 abl 1–3 0 exp wle dge tio beg anc er 7–1 0 1–3 >1 0 e 1–3 adv ert dge abl kno inn ed n, 7–1 1–3 >1 0 <1 0 anc adv wle er 7–1 0 abl e 1–3 clu 4–6 0 <1 kno anc dge ed 7–1 0 7–1 e 1–3 1–3 ste adv wle ed abl 1–3 >1 0 4–6 0 7–1 kno anc dge 7–1 <1 red >1 e 4–6 0 kno wle ed abl 7–1 4–6 0 1–3 0 1–3 0 by beg wle dge 1–3 e 4–6 0 4–6 <1 7–1 inn dge abl 7–1 tre 4–6 <1 er 4–6 0 abl e 4–6 atm 1–3 0 4–6 4–6 >1 e 4–6 1–3 en 4–6 1–3 4–6 4–6 0 7–1 t co 4–6 4–6 4–6 1–3 0 >1 >1 1–3 mb 1–3 0 1–3 4–6 0 4–6 4–6 ina <1 7–1 4–6 4–6 tio 4–6 0 <1 4–6 ns 1–3 4–6 1–3 7–1 1–3 0 >1 1–3 0 1–3 1–3 50 Code IA01 IA02 IA03 IA04 AB01 AB02 IA05 AA01 AA02 IA06 IA07 IA08 IA09 AB03 IA10 IA11 IA12 AB04 AA03 AB05 AA04 IA13 IA14 AB06 AB07 AA05 
AA06 AA07 IA15 IA16 IA01 IA18 IA19 AB08 AB09 AA10 AA11 AA12 AA13 AA14 IA20 2 55 ia ing Criter ience Block Exper ced round advan Backg ced try advan ment m size indus ced Treat Syste try advan indus ced large Tool try advan er ity indus ner large Numb CodeC try begin Code 1 ity indus ner large CodeC my begin 1 ity acade ced IA01 large CodeC my advan 1 ity acade ced IA02 large CodeC try advan 1 ity indus ced IA03 large CodeC my advan 1 ity acade ced IA04 large CodeC my advan 1 ity acade ced AB01 large CodeC try advan 1 ity indus ced AB02 large CodeC try advan 1 ity indus ced IA05 large CodeC try advan 1 ity m indus AA01 ner mediu CodeC try begin 1 m ity indus ced AA02 mediu CodeC my advan 2 m ity acade ced IA06 mediu CodeC try advan 2 m ity indus ced IA07 mediu CodeC try advan 2 m ity indus ner IA08 mediu CodeC try begin 2 m ity indus ced IA09 mediu CodeC my advan 2 m ity acade ner AB03 mediu CodeC my begin 2 m ity acade ced IA10 mediu CodeC my advan 2 m ity acade ced IA11 mediu CodeC my advan 2 m ity acade ced IA12 mediu CodeC try advan 2 m ity indus AB04 ner mediu CodeC try begin 2 m ity indus AA03 ner mediu CodeC my begin 2 ity acade ced AB05 large CodeC my advan 2 xcl acade ced AA04 large Ecl+E my advan 3 xcl acade ced IA13 large Ecl+E my advan 3 xcl acade ced IA14 large Ecl+E my advan 3 xcl acade ced AB06 large Ecl+E try advan 3 xcl indus ced AB07 large Ecl+E try advan 3 xcl indus ced AA05 large Ecl+E try advan 3 xcl indus ced AA06 large Ecl+E try advan 3 m xcl indus AA07 ner mediu Ecl+E try begin 3 m xcl indus IA15 ner mediu Ecl+E my begin 4 m xcl acade ced IA16 mediu Ecl+E my advan 4 m xcl acade ced IA01 mediu Ecl+E my advan 4 m xcl acade ced IA18 mediu Ecl+E my advan 4 m xcl acade ced IA19 mediu Ecl+E my advan 4 m xcl acade ced AB08 mediu Ecl+E my advan 4 m xcl acade ced AB09 mediu Ecl+E my advan 4 m xcl acade AA10 mediu Ecl+E try 4 m xcl indus AA11 blocks mediu Ecl+E 4 m xcl ts and AA12 mediu Ecl+E 4 xcl treatmen AA13 Ecl+E cts to 4 AA14 the subje of IA20 
nment 19 Subjects 193 Table A.3. The correctness of the subjects’ solutions to the tasks Pre-experiment questionnaire Experiment handouts Task solutions & grading schemes Data set Page 22 Correctness Results 24 % .26 more correct with CodeCity Completion Time CodeCity CodeCity Eclipse+Excel Eclipse+Excel 12 % 8 7 .01 6 60 50 faster with CodeCity 5 40 4 30 ANOVA (two-way analysis of variance) statistically significant 95% confidence interval large effect size (d=0.89) CodeCity vs Eclipse ! +24% correctness -12% completion time 3 2 medium large Statistically Significant!! ANOVA (two-way analysis of variance) statistically significant 95% confidence interval moderate effect size (d=0.63) 20 medium large Statistically Significant.. Statistically Significant (who cares?) Richard Wettel, Michele Lanza, Romain Robbes Software Systems as Cities: A Controlled Experiment Richard Wettel and Michele Lanza ABSTRACT Software Systems as Cities: A Controlled Experiment Software visualization is a popular program comprehension technique used in the context of software maintenance, reverse engineering, and software evolution analysis. While there is a broad range of software visualization approaches, only few have been empirically evaluated. This is detrimental to the acceptance of software visualization in both the academic and the industrial world. We present a controlled experiment for the empirical evaluation of a 3D software visualization approach based on a city metaphor and implemented in a tool called CodeCity. The goal is to provide experimental evidence of the viability of our approach in the context of program comprehension by having subjects perform tasks related to program comprehension. We designed our experiment based on lessons extracted from the current body of research. We conducted the experiment in four locations across three countries, involving 41 participants from both academia and industry. 
The experiment shows that CodeCity leads to a statistically significant increase in terms of task correctness and decrease in task completion time. We detail the experiment we performed, discuss its results and reflect on the many lessons learned. ! ! Categories and Subject Descriptors D.2.7 [Distribution, Maintenance and Enhancement]: Restructuring, reverse engineering, and reengineering In Proceedings of ICSE 2011 (33rd International Conference on Software Engineering) pp. 551 - 560. IEEE CS Press, 2011 General Terms Experimentation, Human Factors, Measurement Keywords 1. INTRODUCTION Software visualization is defined by Stasko et al. as “The use of the crafts of typography, graphic design, animation, and cinematography with modern human-computer interaction and computer graphics technology to facilitate both the human understanding and effective use of computer software” [13]. Richard Wettel and Michele Lanza Categories and Subject Descriptors D.2.7 [Distribution, Maintenance and Enhancement]: Restructuring, reverse engineering, and reengineering General Terms Experimentation, Human Factors, Measurement Keywords 1. 1. we detail the experiment design and its operation, reporting on a number of lessons learned regarding the many pitfalls that this type of experiment entails, and INTRODUCTION Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. To copy otherwise, to republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. ICSE ’11, May 21–28 2011, Honolulu, Hawaii Copyright 2011 ACM 978-1-4503-0445-0/11/05 ...$10.00. 3. 
PLEIAD @ DCC University of Chile [email protected] It has earned a reputation as an effective program comprehension technique, and is widely used in the context of software maintenance, reverse engineering and reengineering [2]. The last decade has witnessed a proliferation of software visualization approaches, aimed at supporting a broad range of software engineering activities. As a consequence, there is a growing need for objective assessments of visualization approaches to demonstrate their effectiveness. Unfortunately, only few software visualization approaches have been empirically validated so far, which is detrimental to the development of the field [2]. One of the reasons behind the shortage of empirical evaluation lies in the difficulty of performing a controlled experiment for software visualization. On the one hand, the variety of software visualization approaches makes it almost impossible to reuse the design of an experiment from the current body of research. On the other hand, organizing and conducting a proper controlled experiment is, in itself, a difficult endeavor, which can fail in many ways: data which does not support a true hypothesis, conclusions which are not statistically significant due to insufficient data points, etc. We present a controlled experiment for the empirical evaluation of a software visualization approach based on a city metaphor [16]. The aim is to show that our approach, implemented in a tool called CodeCity, is at least as effective and efficient as the state of the practice at supporting reverse engineering and program comprehension activities. We conceived a set of tasks and measured both the correctness of the task solutions and the task completion time. We conducted the experiment in four locations across three countries, with participants from both industry and academia, with a broad range of experience. 
In this paper we make the following two major contributions: Software visualization, Empirical validation Software visualization is defined by Stasko et al. as “The use of the crafts of typography, graphic design, animation, and cinematography with modern human-computer interaction and computer graphics technology to facilitate both the human understanding and effective use of computer software” [13]. CODECITY IN A NUTSHELL Our approach [16, 15, 19] uses a city metaphor to depict software systems as three-dimensional cities (see Figure 1), where classes are buildings and packages are districts. Romain Robbes REVEAL @ Faculty of Informatics University of Lugano {richard.wettel,michele.lanza}@usi.ch 2. we discuss the results of the experiment, which show that our approach is a viable alternative to existing non-visual techniques. Structure of the paper. In Section 2 we describe our software visualization approach. In Section 3 we present the related work, followed by a list of desiderata in Section 4, extracted from an extensive study of the existing body of research. In Section 5 we describe the design of our experiment and in Section 6 we detail the experiment’s operational phase. In Section 7 we present how we collected the data on which we performed the analysis presented in Section 8. In Section 9 we present the results of the experiment, followed by a presentation of the threats to validity in Section 10, and the conclusions in Section 11. Figure 1: Representation of a software system in CodeCity Similar to the 2D polymetric view approach [4], we map a set of software metrics on the visual properties of artifacts: The Number Of Methods is mapped on the height of the buildings, the Number Of Attributes on the base size, and the number of Lines Of Code on the color of the buildings, from dark gray (low) to intense blue (high). 
We also extended our approach to address the visualization of design problems [19]: Inspired by disease maps we assign vivid colors to design problems and color the affected artifacts according to the design problems that characterize them. The resulting visualization is called disharmony map (see Figure 2) and relies on design problem data computed using Marinescu’s detection strategies [7]. Component java.awt String Calendar NOM: 280 NOA: 88 java.lang NOM: 81 NOA: 7 RELATED WORK 4. There is a rich body of research on empirical evaluation of information visualization by means of controlled experiments. To identify both good practices and commonly occurring mistakes, we conducted an extensive study of the literature. The study [20] extends far beyond the scope of this paper; we limit the discussion to controlled experiments for the evaluation of software visualization. Storey and Müller performed a controlled experiment [14] to compare the support of SHriMP and Rigi in solving a number of program comprehension tasks, both to each other and to a baseline (i.e., SNIFF+). Marcus et al. performed a study to test the support provided by the sv3D visualization tool in answering a set of program comprehension questions [6]. The authors compared the performances obtained by using sv3D to the ones obtained by exploring the source code in an IDE and a text file containing metrics values. Lange et al. performed a controlled experiment in which they evaluated the usefulness of their enriched UML views, by comparing them with traditional UML diagrams [3]. The baseline of the experiment was composed of a UML tool and a metric analysis tool. Quante performed a controlled experiment for the evaluation of dynamic object process graphs in supporting program understanding [10], using not one, but two subject systems. 
A statistically significant improvement of the experimental group could only be detected in the case of one of the systems, which shows that relying on solely one system is unsound. Cornelissen et al. [1] performed a controlled experiment for the evaluation of EXTRAVIS, an execution trace visualization tool. The purpose of the experiment was to evaluate how the availability of EXTRAVIS influences the correctness and the time spent by the participants in solving a number of program comprehension tasks. From the perspective of the experiment’s design, there are two major differences between our experiment and the related work, summarized in Table 1. Subjects Academia Industry Storey et al. [14] 30 Marcus et al. [6] 24 Lange et al. [3] 100 Quante [10] InputEvent java.awt.event NOM: 14 NOA: 21 0 0 0 25 0 http://codecity.inf.usi.ch 17 27 38 39 475 1,725 2 8. Choose real-world systems. Many experiments in the literature use small systems, making it hard to generalize the results for real-world systems. 42 ? ? 20 1 310 57 21 20 1,320 4,656 93 454 Q4 : Do the potential benefits of using CodeCity in terms of correctness and time depend on the user’s background (i.e., academic versus industry practitioner)? 7. Limit the time allowed for solving each task. Allowing unbounded time for a task to avoid time pressure may lead participants to spend the entire time allotted for the experiment on solving a single task, as Quante’s experiment [10] showed. 43 160 Cornelissen et al. [1] 9. Include more than one subject system in the experimental design. Quante [10] showed that performing the same experiment on two different systems can lead to significantly different results. 10. Provide the same data to all participants. The observed effect of the experiment is more likely attributable to the independent variables if this guideline is followed. 
Table 1: Comparing to the related work 5.2.1 Q3 : Which are the task types for which using CodeCity over nonvisual exploration tools makes a difference in either correctness or completion time? 6. Include tasks which may not advantage the tool being evaluated. This allows the experimenters to actually learn something during the experiment, including shortcomings of their approach. 11. Report results on individual tasks. This allows for a more precise and in-depth analysis of the strengths and weaknesses of an approach. The first is that in our experiment, we have an equal split between subjects from academia (i.e., 21) and industry (i.e., 20), while the rest of the experiments have only subjects from academia, with the exception of Cornelissen et al., who had one single industry practitioner. While students are convenient subjects, a sample made up entirely of students might not adequately represent the intended user population [9]. The second issue regards the fact that the systems used by most of the experiments are rather small and, thus, not representative for a realistic work setting. In our experiment the largest of the two systems has almost half a million lines of code. Moreover, learning from Quante’s experiment we do not rely on one system only, but on two. Research Questions & Hypotheses The research questions underlying our experiment are: Q5 : Do the potential benefits of using CodeCity in terms of correctness and time depend on the user’s experience level (i.e., novice versus advanced)? The null hypotheses and alternative hypotheses corresponding to the first two research questions are synthesized in Table 2. 12. Provide all the details needed to make the experiment replicable. Di Penta et al. discuss the benefits of replicability and present a set of guidelines that enable the replication of the experiment [9]. 5. 
EXPERIMENTAL DESIGN The purpose of our experiment is to quantitatively evaluate the effectiveness and efficiency of our approach when compared to state-of-the-practice exploration approaches. Null hypothesis Alternative hypothesis H10 : The tool does not impact the correctness of the solutions to program comprehension tasks. H1 : The tool impacts the correctness of the solutions to program comprehension tasks. H20 : The tool does not impact the time required to complete program comprehension tasks. H2 : The tool impacts the time required to complete program comprehension tasks. Table 2: Null and alternative hypotheses For the third question, we perform a separate analysis of correctness and completion time for each of the tasks. For the last two questions we analyze the data within blocks, which we however do not discuss in this paper due to space reasons. We refer the interested reader to our detailed technical report [20]. 5.2 5.2.2 7.4 Participants’ Feedback The questionnaire handout ends with a debriefing section, in which the participants are asked to assess the level of difficulty for each task and the overall time pressure, to give us feedback which could potentially help us improve the experiment, and optionally, to share with us any interesting insights they encountered during the analysis. System size Medium Large Name Lines of code Packages Classes God classes Brain classes Data classes A4.1 1 11 5 4 0 A4.2 2 7 11 1 0 B1.1 B1.2 0 0 3 8 10 2 2 6 9 2 B2.1 4 11 5 1 0 B2.2 8 9 3 1 0 1 4 6 8 2 Dep. var. 
System size Tool Medium Ecl+Exl CodeCity mean difference min max median stdev 3.50 6.50 5.800 1.147 5.462 Difficulty A1 A2.1 A2.2 tool = Ecl+Excl 7 trivial 7 5 simple 7 10 12 intermediate 3 4 3 difficult 4 0 1 impossible 0 A2.1 0 A2.2 0 Difficulty A1 trivial 7 7 5 simple 7 10 12 intermediate 3 4 3 tool = CodeCity difficult 4 0 1 impossible 0 0 0 Ecl&Exl trivial 3 0 A3 simple 12 9 6 3 8 2 0 8 0 0 A1 A2.1 tool = Ecl+Excl trivial Legend: 12 9 tool = Ecl+Excl 1 11 5 4 0 A3 simple A4.1 A321 A4.2 9 A4.1 7 7 5 4 A3 3 0 0 A4.1 B1.2 0 B2.2 8 9 1 4 6 8 2 1 4 6 2 B2.2 B2.161 8 0 0 B1.2 difficult 1 0 B2.2 impossible8 B2.1 B2.2 impossible B2.1 B1.265 5 4 2 B1.1 B2.1 4 11 5 1 0 B2.1 8 9 3 1 0 3 impossible 4 11 5 1 difficult B1.2 1 B1.1 7 4 2 0 A4.2 9 2 B1.2 B1.2 difficult B1.1 4 A4.2 9 Task Task intermediate simple 2 2 6 difficult intermediate A4.1 A4.2 B1.1 simple intermediate Task A3 10 0 0 A2.2 A4.1 A4.2 B1.1 1 2 0 11 7 0 5 11 3 4 1 8 0 A4.1 0 A4.2 10 B1.1 2 0 2 7 0 2 11 3 6 1 8 9 0 10 2 intermediate A3 trivial 0 12 A2.2 = CodeCity A2.1 9tool A1 trivial CodeCity 6 12 3 9 0 A1 A2.1 A2.2 1 1 A163 A2.1 A2.2 6 10 8 Difficulty trivial simple intermediate difficult impossible B2.200 10 1 1 B2.1 6 8 5 B2.2 impossible 6 simple difficult impossible Figure 4:trivial Histograms of intermediate perceived difficulty per task 12 3 9 0 A1 A2.1 A3 A4.1 A4.2 B1.1 B1.2 B2.1 B2.2 6 The reason is that weA2.2 underestimated the knowledge of CodeCity 3 Task required0 to perform this task. Solving this task with our toolB2.2implies A1 A2.1 A2.2 A3 A4.1 A4.2 B1.1 B1.2 B2.1 a deep knowledge of CodeCity’s customization features, and of the Task underlying Smalltalk programming language: the only subject who managed to solve the task in the experimental group is a Smalltalk expert. These were unreasonable requirements to expect from the experimental subjects, who were not trained to use such features. 
To eliminate this unusually large discrepancy between the two groups, we excluded the task from the analysis. Difficulty trivial simple intermediate difficult impossible Difficulty trivial simple intermediate difficult impossible 8.2 A1 A1 3 6 8 2 0 A2.1 A2.2 3 1 1 6 10 8 8 8 10 2 0 0 0 A2.1 0 A2.2 0 1 1 10 8 8 10 0 0 0 0 A3 A3 1 2 7 5 4 A4.1 1 2 7 5 4 A4.1 9 7 3 0 0 A4.2 9 7 3 0 0 A4.2 4 9 4 2 0 B1.1 4 9 4 2 0 B1.1 1 7 5 4 2 B1.2 1 7 5 4 2 B1.2 5 6 8 0 0 B2.1 B2.2 5 1 6 6 8 10 0 1 0 B2.1 1 B2.2 1 0 6 0 10 6 1 8 1 5 0 0 6 8 5 3.896 2.27 6.00 3.900 1.085 Overall Ecl+Exl CodeCity 5.050 +29.62% 3.00 6.30 5.100 1.031 4.803 2.27 6.50 4.430 1.349 5.968 +24.26% 3.00 8.00 6.065 1.294 Medium Ecl+Exl CodeCity 38.809 31.92 53.08 38.000 6.789 33.178 -14.51% 24.67 39.50 35.575 5.545 8.3 Completion Time (minutes) Large Overall Ecl+Exl CodeCity Ecl+Exl CodeCity 44.128 22.83 55.92 48.260 11.483 39.644 -10.16% 27.08 48.55 40.610 6.963 41.048 22.83 55.92 40.080 9.174 Subject Analysis 9.1 After the pilot study involving nine participants, we conducted the experiment with a total of 45 participants in several runs. After removing four data points during the outlier analysis, we were left with 41 subjects, of which 20 industry practitioners (all advanced), and 21 from academia (of which 9 beginners and 12 advanced). For each of the 4 treatments, we have 8–12 data points and a fair subject distribution within the remaining three blocks, as shown in Table 6. 36.117 -12.01% 24.67 48.55 36.125 6.910 Table 7: Descriptive statistics related to correctness and completion time T1 50.00 60.00 44.13 Outlier Analysis Before performing our statistical test, we followed the suggestion of Wohlin et al. [21] regarding the removal of outliers caused by exceptional conditions, to allow us to draw valid conclusions from the data. 
During the Bologna I experimental run, one of the participants assigned to an experimental treatment experienced serious performance slowdowns due to the low performance of his computer. Although this participant was not the only one reporting performance slowdowns, he was by far the slowest as measured by the completion time and, since this represented an exceptional condition, we excluded his data from the analysis. Another participant got assigned to an Ecl+Exl treatment, although he did not have any experience with Eclipse, but with another IDE. For this reason, this subject took more time in the first tasks than the others, because of his lack of experience with Eclipse. Since we did not want to compromise the analysis by disfavoring any of the groups (i.e., this data point provided the highest completion time and would have biased the analysis by disadvantaging the control groups), we also excluded this data point from the analysis. During the Bologna II run, two participants had compatibility problems with the virtualization software installed on their machines. Eventually we lent them our two replacement machines, but due to the meeting room’s tight schedule, we were not able to wait for them to finish the experiment. We decided to exclude these two data points from our analysis. Correctness (points) Large Ecl+Exl CodeCity 6.733 +23.27% 5.00 8.00 6.585 .959 The effect of tool on completion time. There was a significant main effect of the tool on the completion time, F (1, 37) = 4.392, p = .043, indicating that the mean completion time was significantly lower for CodeCity users than for Ecl+Exl users. Overall, there was a decrease in completion time of 12.01% for CodeCity users (M = 36.117, SD = 6.910) over Ecl+Exl users (M = 41.048, SD = 9.174).
In the case of the medium size system, there was a 14.51% decrease in completion time of CodeCity users (M = 33.178, SD = 5.545) over Ecl+Exl users (M = 38.809, SD = 6.789), while in the case of the large size system, there is a 10.16% decrease in completion time for CodeCity users (M = 39.644, SD = 6.963) over Ecl+Exl users (M = 44.128, SD = 11.483). The data shows that the time decrease for CodeCity users over Ecl+Exl users is slightly lower in the case of the large system compared to that obtained for the medium system. The effect of system size on completion time. While not the goal of the experiment, a significant effect of system size on the completion time was observed, F (1, 37) = 5.962, p = .020, indicating that completion time was significantly lower for users analyzing the medium system than for users analyzing the large system. The main effect of both tool and object system size on completion time and the lack of the effect of interaction between tool and object system size on completion time, are illustrated in Figure 7, as well as the completion time box plots for the four treatments. 40.00 39.64 30.00 Tool Ecl+Exl CodeCity 38.81 33.18 20.00 40.00 30.00 Tool 10.00 50.00 Ecl+Exl CodeCity 0.00 20.00 Medium Large Object system size Medium Large Object system size EXAMINE VARIABLES=correctness BY Syssize /PLOT=BOXPLOT /STATISTICS=NONE /NOTOTAL /ID=Tool. Figure 7: Means and box plots for completion time 9.3 Result Summary EXAMINE VARIABLES=correctness BY Syssize BY Tool /PLOT=BOXPLOT /STATISTICS=NONE /NOTOTAL. Correctness. The data allows us to reject the first null hypothesis Explore H10 in favor of the alternative hypothesis H1, which states that the tool impacts the correctness of the solutions to program comprehension tasks. Overall, CodeCity enabled an increase in correctness of 24.26% over Ecl+Exl. This result is statistically significant. Completion time. 
We can also reject the second null hypothesis H20 in favor of the alternative hypothesis H2, which states that the tool impacts the time required to complete program comprehension tasks. Overall, CodeCity enabled a completion time reduction of 12.01% over Ecl+Exl. This result is statistically significant. Page 5 9.4 Task Analysis A secondary goal of our experiment was to identify the types of tasks for which CodeCity provides an advantage over the baseline. To this end, for each task described in Section 5.5 we compared the performances—in terms of correctness and time—of the two tools and reasoned about the potential causes behind the differences. See Figure 8 for a graphical overview supporting our task analysis5 . A1 - Identifying the convention used to organize unit tests relative to the tested classes. While Eclipse performed consistently, CodeCity outperformed it on the medium system and underperformed it on the large system. The difference in performance is partially owed to the fact that, in spite of the existence of a number of classes named *Test, there are no unit tests in the large system. Only a small number of CodeCity users examined the inheritance relations, while the majority relied only on name matching. The time is slightly better for the CodeCity subjects, who benefited from the visual overview of the system, while Eclipse required scrolling up and down through the package structure. A2.1 - Determining the spread of a term among the classes. CodeCity performed marginally better than Eclipse in both correctness and completion time. In CodeCity, once the term search is completed, the spread can be visually assessed. In Eclipse, the term search produces a list of class names. For this task the list showed classes in many packages belonging to different hierarchies and, therefore, a dispersed spread may represent a safe guess. A2.2 - Determining the spread of a term among the classes.
Although the task is similar to the previous one, the results in correctness are quite different: CodeCity significantly outperformed Eclipse by 29–38%. The list of classes and packages in Eclipse, without the context provided by an overview (i.e., How many other packages are there in the system?) deceived some of the control subjects into believing that the spread of the term was dispersed, while the CodeCity users took advantage of the “big picture” and better identified the localized spread of this term. A3 - Estimating impact. CodeCity significantly outperformed Eclipse in correctness by 40–50%, and was slightly faster than Eclipse. Finding the caller classes of a given class in Eclipse is not straightforward. Moreover, the resulting list provides no overview. A4.1 - Identifying the classes with the highest number of methods. In terms of correctness, CodeCity was on a par with Excel for the medium system and slightly better for the large system. In terms of completion time, Excel was slightly faster than CodeCity. While CodeCity is faster at building an approximate overview of systems, a spreadsheet is faster at finding precise answers in large data sets. B1.1 - Identifying the package with the highest percentage of god classes. In both correctness and completion time, CodeCity slightly outperformed Excel. The low scores in correctness of both tools show that neither of them is good enough to solve this problem alone. Instead, CodeCity’s visual presentation and Excel’s precision could complement each other well.
Page 9 5 Due to space concerns, we do not discuss task B2.2 tool = CodeCity 15 Block Experience Level CodeCity none beginner knowledgeable advanced expert OOP Experience Level Eclipse+Excel none beginner knowledgeable advanced expert OOP Experience Level CodeCity none beginner knowledgeable advanced expert OOP Java 0 0 5 9 8 Eclipse 0 3 4 12 3 1 5 10 6 0 Reverse Engineering 0 10 7 4 1 0 0 8 10 1 Reverse Engineering 0 7 7 4 1 1 5 10 6 0 Reverse Engineering 0 10 7 4 1 12 9 6 3 0 Academia Industry OOP Treatment T2 T3 Beginner Advanced Beginner Advanced 2 2 0 6 3 2 0 7 2 3 0 3 Total 10 12 8 Java Eclipse T4 Total 2 9 5 12 0none 0 4beginner 20 Legend: knowledgeable 11advanced 41 expert Reverse Engineering Expertise field Table 6: Subject distribution Moreover, the random assignments of treatment within blocks led to a balanced distribution of the subjects’ expertise among treatments, as we see in Figure 5. tool = Ecl+Excl Ecl&Exl Java 0 0 2 12 5 Eclipse 0 0 2 13 4 15 12 9 6 3 0 Legend: none beginner knowledgeable advanced expert OOP Java tool = CodeCity Eclipse 0 3 4 12 3 15 12 9 6 3 0 Eclipse Reverse Engineering Expertise field CodeCity Java 0 0 5 9 8 Legend: none beginner knowledgeable advanced expert OOP Java Eclipse Reverse Engineering Expertise field Figure 5: Subjects’ expertise tool = Ecl+Excl OOP Java 0 0 2 12 5 Eclipse 0 0 2 13 4 0 0 8 10 1 9. Reverse Engineering 0 7 7 4 1 12 9 6 3 0 RESULTS 7.00 6.00 none beginner knowledgeable advanced expert Based on the designJava of our experiment, i.e., a between-subjects, OOP Eclipse Reverse Engineering unbalanced (i.e., implying unequal sample sizes) design with two Expertise field independent variables (tool and system size), the suitable parametric test for hypothesis testing is a two-way Analysis Of Variance (ANOVA). We performed the test for correctness and completion time using the SPSS statistical package. Before the analysis, we ensured that our data met the test’s assumptions: 1. Independence of observations. 
This assumption is implicitly met through the choice of a between-subjects design. 2. Homogeneity of variances of the dependent variables. We tested our data for homogeneity of both correctness and completion time, using Levene’s test [5] and in both cases the assumption was met. 3. Normality of the dependent variable across levels of the independent variables. We tested the normality of correctness and completion time across the two levels of tool and object system size using the Shapiro-Wilk test for normality [11], and this assumption was also met in all cases. We chose a significance level of .05 (α = .05), which corresponds to a 95% confidence interval. The statistics related to correctness and completion time are presented in Table 7. A1 A2.1 0.10 1.00 0.50 0.55 6.73 Ecl+Exl CodeCity 7.00 5.05 6.00 3.90 3.00 5.00 4.00 2.00 Tool Ecl+Exl CodeCity 1.00 3.00 2.00 0.00 Medium Medium Large Large Object system size Object system size EXAMINE VARIABLES=time BY Syssize BY Tool /PLOT=BOXPLOT /STATISTICS=NONE /NOTOTAL. Figure 6: Means and box plots for correctness Explore Analysis Results on Completion Time Interaction effect between tool and system size on completion time. Similarly, it is important that there is no interaction between the two factors, which could have affected the completion time. The interaction effect of tool and system size on completion time was not significant, F (1, 37) = .057, p = .813. The data shows no evidence that the variation in completion time between CodeCity and Ecl+Exl depends on the size of the system, which strengthens any observed effect of the tool factor on the completion time.
Page 4 1.00 1.00 A3 0.75 0.98 0.46 0.62 A4.1 0.54 0.62 0.04 0.26 Page 7 A4.2 1.00 1.00 0.75 1.00 B1.1 0.00 0.11 0.67 1.00 B1.2 0.10 0.33 0.00 0.17 B2.1 0.90 0.83 0.75 1.00 0 1.00 1.00 0.60 1.00 0.83 0 A2.2 A3 A4.2 1.00 0.75 B1.2 B2.1 0.80 0.75 0.67 0.63 0.54 0.10 0.04 A2.2 A3 T2 CodeCity installation with a loaded model of the system to be analyzed, and the source code of the object system, accessible from the visualizations. Ecl+Exl T3 T4 Eclipse installation with default development tools, an Eclipse workspace containing a project with the entire source code of the object system, an Excel installation, and a sheet containing all the metrics and design problem data required for solving the tasks and available to the experimental groups. 0 A4.1 A4.2 0 B1.1 B1.2 B2.1 Treatment Description 5.5 Id Task Concern A1 Description. Locate all the unit test classes of the system and identify the convention (or lack thereof) used by the developers to organize the tests. Rationale. Test classes are typically defined in packages according to a project-specific convention. Before integrating their work in the system, developers need to understand how the test classes are organized. Software architects design the high-level structure of the system (which may include the convention by which test classes are organized), while quality assurance engineers monitor the consistency of applying these rules in the system. Structural understanding A2.1 Description. Look for the term T1 in the names of classes and their attributes and methods, and describe the spread of these classes in the system. Rationale. Assessing how domain knowledge is encapsulated in source code is important in several scenarios. To understand a system they are not familiar with, developers often start by locating familiar domain concepts in the source code. Maintainers use concept location on terms extracted from change requests to identify where changes need to be performed in the system.
Software architects want to maintain a consistent mapping between the static structure and the domain knowledge. Each of these tasks starts with locating a term or set of terms in the system and assessing its dispersion. A2.2 Description. Look for the term T2 in the names of classes and their attributes and methods, and describe the spread of these classes in the system. Rationale. Same as for task A2.1. However, the term T2 was chosen such that it had a different type of spread than T1. A3 http://www.virtualbox.org A2.1 7.22 4.82 7.29 5.39 0 A2.2 5.49 3.53 6.51 4.34 A A3 2.95 4.76 3.56 5.54 A4.1 7.66 6.32 7.95 7.75 A4.2 5.63 4.41 5.14 4.30 B1.1 9.35 8.74 7.33 4.50 B1.2 6.86 4.94 7.13 6.23 Concept location Metric-based analysis A4.2 Description. Find the three classes with the highest average number of lines of code per method in the system. Rationale. It is difficult to prioritize candidates for refactoring from a list of large classes. In the absence of other criteria, the number and complexity of methods can be used as a measure of the amount of functionality for solving this problem related to maintenance and quality assurance. Metric-based analysis B1.1 Description. Identify the package with the highest percentage of god classes in the system. Rationale. God classes are classes that tend to incorporate an overly large amount of intelligence. Their size and complexity often make them a maintainer’s nightmare. Keeping these potentially problematic classes under control is important. By maintaining the ratio of god classes in packages to a minimum, the quality assurance engineer keeps this problem manageable. For a project manager, in the context of the software process, packages represent work units assigned to the developers. Assessing the magnitude of this problem allows him to take informed decisions in assigning resources. Focused design assessment B1.2 Description. Identify the god class containing the largest number of methods in the system. Rationale.
It is difficult to prioritize candidates for refactoring from a list of god classes. In the absence of other criteria (e.g., the stability of a god class over its evolution), the number of methods can be used as a measure of the amount of functionality for solving this problem related to maintenance and quality assurance. Focused design assessment B2.1 Description. Identify the dominant class-level design problem (the design problem that affects the largest number of classes) in the system. Rationale. God class is only one of the design problems that can affect a class. A similar design problem is the brain class, which accumulates an excessive amount of intelligence, usually in the form of brain methods (i.e., methods that tend to centralize the intelligence of their containing class). Finally, data classes are just “dumb” data holders without complex functionality, but with other classes strongly relying on them. Gaining a “big picture” of the design problems in the system would benefit maintainers, quality assurance engineers, and project managers. Holistic design assessment B2.2 Description. Write an overview of the class-level design problems in the system. Describe your most interesting or unexpected observations. Rationale. The rationale and targeted user roles are the same as for task B2.1. However, while the previous one gives an overview of design problems in figures, this task provides qualitative details and has the potential to reveal the types of additional insights obtained with visualization over raw data. Holistic design assessment 6. OPERATION 2009 November December 18 24 25 2 9 21 2010 February January 28 5 8 14 28 18 .. April 22 24 25 Pilot 14 Experiment Lugano Tasks 10.00 Concept location Description. Evaluate the change impact of class C by considering its caller classes (classes invoking any of its methods). 
The assessment is done in terms of both intensity (number of potentially affected classes) and dispersion (how these classes are distributed in the package structure). Rationale. Impact analysis allows one to estimate how a change to a part of the system impacts the rest of the system. Although extensively used in maintenance activities, impact analysis may also be performed by developers when estimating the effort needed to perform a change. It also gives an idea of the quality of the system: A part of the system which requires a large effort to change may be a good candidate for refactoring. A4.1 Description. Find the three classes with the highest number of methods (NOM) in the system. Rationale. Classes in object-oriented systems ideally encapsulate one single responsibility. Since methods are the class’s unit of functionality, the number of methods metric is a measure of the amount of functionality of a class. Classes with an exceptionally large number of methods make good candidates for refactoring (e.g., split class), and therefore are of interest to practitioners involved in either maintenance activities or quality assurance. Table 5: Tasks Our approach provides aid in comprehension tasks supporting adaptive and perfective maintenance. We tried to use the previously defined maintenance task definition frameworks by Pacione et al. [8] and by Sillito et al. [12] to design the tasks of our evaluation. However, both frameworks proved ill-suited. Since CodeCity relies exclusively on static information extracted from the source code, it was not realistic to map our tasks over Pacione’s model, which is biased towards dynamic information visualization. Sillito’s well-known set of questions asked by developers, although partially compatible with our tasks, refers to developers exploring source code only. Our approach supports software architects, designers, quality-assurance engineers, and project managers, in addition to developers.
These additional roles assess systems at higher levels of abstraction not covered by Sillito’s framework. We decided to use these works as inspirations and defined our own set of tasks, detailed in Table 5, dealing with various maintenance concerns and split into program comprehension (A) and design quality assessment (B). A1 ...happily( ever(aſter 2. we discuss the results of the experiment, which show that our approach is a viable alternative to existing non-visual techniques. Structure of the paper. In Section 2 we describe our software visualization approach. In Section 3 we present the related work, followed by a list of desiderata in Section 4, extracted from an extensive study of the existing body of research. In Section 5 we describe the design of our experiment and in Section 6 we detail the experiment’s operational phase. In Section 7 we present how we collected the data on which we performed the analysis presented in Section 8. In Section 9 we present the results of the experiment, followed by a presentation of the threats to validity in Section 10, and the conclusions in Section 11. Table 4: Treatments We provided the treatments as virtual images for VirtualBox4 , which was the only piece of software required to be installed by the participants. Each virtual image contained only the necessary pieces of software (i.e., either CodeCity or Eclipse+Excel), installed on a Windows XP SP2 operating system. 2.00 B1.1 0.90 0.75 0.46 Findbugs T1 4.00 0.17 0.86 A2.1 Tool 8.00 A4.1 tool = CodeCity 1.00 0.78 0.10 A1 Azureus CodeCity 6.00 0.11 A2.1 0.91 0.33 0.26 A1 0.50 0.40 0.20 Treatments Completion time (minutes) 1.00 0.62 0.62 0.55 1.00 0.80 5.4 By combining the two levels of each of the two independent variables we obtain four treatments, illustrated in Table 4. 
Time T1: CodeCity, large T2: CodeCity, medium T3: Ecl+Excl, large T4: Ecl+Excl, medium 0.80 1.00 0.63 0.91 tool = CodeCity 0.98 0.95 0.96 0.80 0.60 0.40 Controlled Variables We identified two factors that could have an influence on the subjects’ performance, i.e., their background and experience level. The background represents the working context of a subject, which we divided into industry and academy. The second factor is experience level, which represents the domain expertise gained by each of the participants, divided into beginner and advanced. For academia, subjects below a PhD were considered beginners, while researchers (i.e., PhD students, post-docs and professors) were considered advanced. For industry, we considered that participants with up to three years of experience were beginners, and the rest advanced. We used a randomized block design, with background and experience level as blocking factors. We assigned each participant—based on personal information collected before the experiment—to one of the four categories (i.e., academy-beginner, academy-advanced, industry-beginner, and industry-advanced). We then randomly assigned one of the four treatments (i.e., combinations of tool and system size) to the participants in each category. 4 0.20 tool = Ecl+Excl Tool 8.00 5.46 4.00 A2.2 0.86 0.96 0.78 0.95 tool = Ecl+Excl Correctness (points) Medium object system 5.00 time 9.2 Azureus 454’387 520 4’656 111 55 256 http://findbugs.sourceforge.net http://azureus.sourceforge.net Analysis Results on Correctness Interaction effect between tool and system size on correctness. It is important that there is no interaction between the two factors, which could have affected the correctness. The interaction effect of tool and system size on correctness was not significant, F (1, 37) = .034, p = .862. 
The data shows no evidence that the variation in correctness between CodeCity and Ecl+Exl depends on the size of the system, which strengthens any observed effect of the tool factor on the correctness. The effect of tool on correctness. There was a significant main effect of the tool on the correctness of the solutions, F (1, 37) = 14.722, p = .001, indicating that the mean correctness score obtained by CodeCity users was significantly higher than the one for Ecl+Exl users, regardless of the size of the object system. Overall, there was an increase in correctness of 24.26% for CodeCity users (M = 5.968, SD = 1.294) over Ecl+Exl users (M = 4.803, SD = 1.349). In the case of the medium size system, there was a 23.27% increase in correctness of CodeCity users (M = 6.733, SD = .959) over Ecl+Exl users (M = 5.462, SD = 1.147), while in the case of the large size system, the increase in correctness was 29.62% for CodeCity users (M = 5.050, SD = 1.031) over Ecl+Exl users (M = 3.896, SD = 1.085). The data shows that the increase in correctness for CodeCity over Ecl+Exl was higher for the larger system. The effect of system size on correctness. Although not the object of the experiment, an expected significant main effect of system size on the correctness of the solutions was observed, F (1, 37) = 26.453, p < .001, indicating that the correctness score was significantly higher for users performing the analysis on the medium size system than for users performing the analysis on the large size system, regardless of the tool they used to solve the tasks. The main effect of both tool and object system size on correctness and the lack of the effect of interaction between tool and object system size on correctness are illustrated in Figure 6, as well as the correctness box plots for the four treatments. While some of the subjects assigned to CodeCity have little or no experience with Eclipse, every subject assigned to Ecl+Exl is at least knowledgeable in using this IDE. 
15 Legend: Experience Level Eclipse+Excel none beginner knowledgeable advanced expert FindBugs 93’310 53 1’320 62 9 67 Table 3: The two object systems 2 3 Correctness (points) A3 5 12 3 1 0 Correctness A2.2 7 10 4 0 0 Correctness A2.1 7 7 3 4 0 Frequency A1 tool = CodeCity Frequency Preliminary Data Analysis Frequency Correctness Data To convert the collected task solutions into quantitative information, we needed an oracle, which would provide both the set of correct answers and the grading scheme for each task. Having two object systems and two data sources (source code for the control group and a model of the system for the experimental group), led to four oracle sets. To build them, two of the authors and a third experimenter solved the tasks with each treatment and designed the grading scheme for each task. In addition, for the two experimental treatments, we computed the results using queries on the model of the object system, to make sure that we were not missing any detail because of particularities of the visual presentation (e.g., occlusion, too small buildings, etc.). Eventually, we discussed the divergences and merged our solutions. Finally, we needed to grade the solution of the subjects. We employed blind marking to rule out bias; when grading a solution the experimenter does not know whether the subject that provided the solution has used an experimental or a control treatment. For this, one of the authors created four code names for the groups and created a mapping between groups and code names, known only to him. Then he provided the other two experimenters with the encoded data, along with the encoded corresponding oracle, which allowed them to perform blind grading. In addition, the author who encoded the data performed his grading unblinded. Eventually, the authors discussed the differences and converged towards the final grading. The minimum grade was 0, while the maximum was 8. 
DATA ANALYSIS Difficulty trivial simple intermediate difficult impossible Frequency 7.3 8. 8.1 We observed an exceptional condition related to task A4.2, which had the highest discrepancy in time and correctness between control and experimental groups. The data points showed that the experimental group was not able to solve this task, while the control group was quite successful at solving it. The experimental group had an average of 0.06, with 19 null scores (out of 22), and most subjects used up the entire allotted time ceiling effect). The control trivial simple (i.e., intermediate difficult impossible group had an average of 0.86 with 15 perfect scores (out of 19) and 12 9 most subjects finished in roughly half the allotted time. The subjects 6 3 its difficulty accordingly: In the debriefing questionnaire, perceived 0 experimental graded “impossible”, while the A1 subjects A2.1 A2.2 A3 task A4.1 A4.2 A4.2 asB1.1 B1.2 B2.1 B2.2 Task control group described it as “simple”, as shown in Figure 4. Completion time Timing Data Completion time Personal Information Frequency DATA COLLECTION Frequency Frequency Frequency 7.2 Frequency Frequency 7.1 Object Systems We chose two Java systems, both large enough to potentially benefit from visualization, yet of different size, so that we can reason about this independent variable. The smaller of the two systems is FindBugs2 , a tool using static analysis to find bugs in Java code, developed as an academic project at the University of Maryland, while the larger system is Azureus3 , a popular P2P file sharing client and one of the most active open-source projects hosted at SourceForge. In Table 3, we present the main characteristics of the two systems related to the tasks of the experiment. 
Dependent and Independent Variables The purpose of the experiment is to show whether CodeCity’s 3D visualizations provide better support to software practitioners in solving program comprehension tasks than state-of-the-practice exploration tools. Additionally, we are interested in how well CodeCity performs compared to the baseline when analyzing systems of different magnitudes. Consequently, our experiment has two independent variables: the tool used to solve the tasks and the object system size. The tool variable has two levels, i.e., CodeCity and a baseline, chosen based on the criteria described in Section 5.2.1. The object system size has two levels, i.e., medium and large, because visualization starts to become useful only when the analyzed system has a reasonable size. The object systems chosen to represent these two treatments are presented in Section 5.2.2. Similarly to other empirical evaluations of software visualization approaches [3, 10, 1], the dependent variables of our experiment are correctness of the task solution (i.e., a measure of effectiveness) and completion time (i.e., measure of efficiency). The design of our experiment is a between-subjects design, i.e., a subject is part of either the control group or of the experimental group. Correctness T1: CodeCity, large T2: CodeCity, medium T3: Ecl+Excl, large T4: Ecl+Excl, medium 7. We collected data continuously—before, during, and after the experiment. Before the experiment, we collected both personal information (e.g., gender, age, nationality) and professional data (e.g., job position, experience with object-oriented programming, Java, Eclipse, and reverse engineering) by means of an online questionnaire, to be found in [20]. To time participants accurately, we developed our own timing web application in Smalltalk. During the experimental sessions, the timing application would run on the experimenter’s computer and project the current time. 
The displayed time was used as common reference by the participants whenever they were required in the questionnaire to log the time. In addition, the application displayed, for each participant, the name, current task, and the remaining time allotted for the task. The subjects were asked to announce to the experimenter every time they logged the time, so that the experimenter could reset their personal timer by clicking on the hyperlink marked with the name of the subject. Whenever a subject was unable to finish a task in the allotted time (10 minutes for each task), the application would display the message “overtime” beside the name, and the experimenter would ask the subject to immediately pass to the next task, before resetting the timer. 5.3 Finding a Baseline There is a subtle interdependency between the baseline and the set of tasks for the experiment. In an ideal world, we would have devised tasks for each of the three contexts in which we applied our approach: software understanding, evolution analysis, and design quality assessment. Instead, we had to settle for a reasonable compromise. We looked for two characteristics in an appropriate baseline: data & feature compatibility with CodeCity and recognition from the community (i.e., a state-of-the-practice tool). Unfortunately we could not find a single tool satisfying both criteria. To allow a fair comparison, without having to limit the task range, we opted to build a baseline from several tools. The baseline needed to provide exploration and querying functionality, support for presenting at least the most important software metrics, support for design problems exploration, and if possible support for evolutionary analysis. In spite of the many existing software analysis and visualization approaches, software understanding is still mainly performed at the source code level. 
Since the most common source code exploration tools are integrated development environments (IDEs), we chose Eclipse, a popular IDE in both academia and industry, which represents the current state-of-the-practice. The next step was finding support for exploring meta-data, such as software metrics and design problem data, since they were not available in Eclipse. We looked for a convenient Eclipse plugin for metrics or even an external metrics tool, but could not find one that fit our requirements (including support for user-defined metrics). Since we did not want to confer an unfair data advantage to the subjects in the experimental group, we chose to provide the control group with tables containing the metrics and design problem data, and the popular Excel spreadsheet application for exploring the data. Finally, due to Eclipse’s lack of support for multiple versions, we decided to exclude the evolution analysis from our evaluation, although we consider it one of the strong points of our approach. Providing the users with several projects in Eclipse representing different versions of the same system, with no relation among them (or even worse, with just a versioning repository), would have been unfair. Q1: Does the use of CodeCity increase the correctness of the solutions to program comprehension tasks, compared to non-visual exploration tools, regardless of the object system’s size? Q2: Does the use of CodeCity reduce the time needed to solve program comprehension tasks, compared to non-visual exploration tools, regardless of the object system’s size? 3. Take into account the range of experience level of the participants. Experience can greatly influence the outcome of the experiment. 5. Find a set of relevant tasks. The tasks should be close in scope and complexity to real tasks performed by practitioners. Object system(s) Classes kLOC Wettel et al. [20] Figure 2: Example of disharmony map (a part of JDK 1.5) 5.1 2. Involve participants from industry.
Any approach devised to support practitioners in their work is best evaluated using a subject sample with a fair share of software practitioners. 4. Provide a tutorial of the experimental tool to the participants. The experimental group would require intensive training to even come close to the skills of the control group acquired in years of operation. Therefore, a minimal exercising of the experimental group in using the features required to solve the tasks is paramount. java.utils CodeCity also supports software evolution analysis activities [18], which for reasons we explain later we did not evaluate. Tool support. Our approach is implemented in CodeCity [17], a freely available1 standalone tool programmed in Smalltalk and depicted in Figure 1. Apart from the main visualization panel, there is an information panel providing contextual data on the selected city artifact (e.g., name, metric values, etc.). CodeCity supports a broad range of facilities to interact, such as inspection or querying, and navigate the visualization. Although not integrated in an IDE, CodeCity features unidirectional linking between the visualization and the source code, i.e., the source code of the software element represented by a city artifact can be accessed in an editor from the visualization. CodeCity is customizable: the user can compose a new view by choosing from a set of metrics and artifact representations. 1 1. Choose a fair baseline for comparison. If one wants to provide evidence for the value of an approach the question of what one compares it to has to be answered unambiguously. NOM: 71 NOA: 81 Experiment EXPERIMENTAL DESIGN WISH LIST We conducted a very exhaustive survey of research works dealing with experimental validation of software engineering, information visualization, and software visualization approaches. The survey, which included more than 50 articles and various books, is detailed in a technical report [20]. 
From the survey we distilled the following experimental design wish list, which we kept in mind as we designed and conducted the experiment: Correctness (points) 2. Software Systems as Cities: A Controlled Experiment Software visualization is a popular program comprehension technique used in the context of software maintenance, reverse engineering, and software evolution analysis. While there is a broad range of software visualization approaches, only few have been empirically evaluated. This is detrimental to the acceptance of software visualization in both the academic and the industrial world. We present a controlled experiment for the empirical evaluation of a 3D software visualization approach based on a city metaphor and implemented in a tool called CodeCity. The goal is to provide experimental evidence of the viability of our approach in the context of program comprehension by having subjects perform tasks related to program comprehension. We designed our experiment based on lessons extracted from the current body of research. We conducted the experiment in four locations across three countries, involving 41 participants from both academia and industry. The experiment shows that CodeCity leads to a statistically significant increase in terms of task correctness and decrease in task completion time. We detail the experiment we performed, discuss its results and reflect on the many lessons learned. PLEIAD @ DCC University of Chile [email protected] It has earned a reputation as an effective program comprehension technique, and is widely used in the context of software maintenance, reverse engineering and reengineering [2]. The last decade has witnessed a proliferation of software visualization approaches, aimed at supporting a broad range of software engineering activities. As a consequence, there is a growing need for objective assessments of visualization approaches to demonstrate their effectiveness. 
Unfortunately, only few software visualization approaches have been empirically validated so far, which is detrimental to the development of the field [2]. One of the reasons behind the shortage of empirical evaluation lies in the difficulty of performing a controlled experiment for software visualization. On the one hand, the variety of software visualization approaches makes it almost impossible to reuse the design of an experiment from the current body of research. On the other hand, organizing and conducting a proper controlled experiment is, in itself, a difficult endeavor, which can fail in many ways: data which does not support a true hypothesis, conclusions which are not statistically significant due to insufficient data points, etc. We present a controlled experiment for the empirical evaluation of a software visualization approach based on a city metaphor [16]. The aim is to show that our approach, implemented in a tool called CodeCity, is at least as effective and efficient as the state of the practice at supporting reverse engineering and program comprehension activities. We conceived a set of tasks and measured both the correctness of the task solutions and the task completion time. We conducted the experiment in four locations across three countries, with participants from both industry and academia, with a broad range of experience. In this paper we make the following two major contributions: 1. we detail the experiment design and its operation, reporting on a number of lessons learned regarding the many pitfalls that this type of experiment entails, and Software visualization, Empirical validation Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. 
To copy otherwise, to republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. ICSE ’11, May 21–28 2011, Honolulu, Hawaii Copyright 2011 ACM 978-1-4503-0445-0/11/05 ...$10.00. ABSTRACT Romain Robbes REVEAL @ Faculty of Informatics University of Lugano {richard.wettel,michele.lanza}@usi.ch ! B2.1 2.14 1.88 2.65 2.10 1.69 2.52 3.91 3.14 1 3 1 1 1 1 1 1 1 1 3 2 1 R R 1 1 1 R 6 1 Antwerp n m R remote, self-controlled session 5 6 Bern 4 6 During the experimental session(s), the subjects solve the tasks with the assigned tool on the assigned object system, under the experimenter’s observation. The numbers in Figure 3 reflect only the participants whose data points were taken into account during the analysis, and not those excluded from it, which we discuss later. 6.1 Bologna Training session followed by an experimental session with n subjects with CodeCity (experimental) m subjects with Eclipse+Excel (control) Legend: R 1 Figure 3: The timeline of the experiment The operation covered the period Nov 2009–Apr 2010 and was divided in two phases: the pilot and the experiment. Figure 3 shows the timeline of the operation. The operation is composed of a series of experimental runs; each run consists of a one hour training session, followed by one or more experimental sessions of up to two hours. A training session consists of a talk in which the experimenter presents the approach, concluded with a CodeCity demonstration. The Pilot Study We conducted a pilot study with Master students of the University of Lugano enrolled in a course on Software Design; improving the questionnaire and solving problems as they emerged required several iterations. We organized a joint training session, followed by three experimental sessions, each one week apart. 
Before the first session we conducted the experiment with a researcher from our group, with extensive experience with Eclipse, to make sure the tasks for the control group were doable in the allotted time. We did not include any of these data points in the analysis. 6.2 Experimental Runs At this point, we were confident enough to start our experiment. We performed the following experimental runs: Bologna I. 8 professionals with 4–10 years of experience. Bologna II. 9 professionals with 7–20 years of experience. Lugano I. 1 researcher/development leader of a small company. Lugano II & III. 5 practitioners with 10+ years of experience. Antwerp. 3 Ph.D. and 8 MSc students. Bern. 2 consultants, 1 professor, 7 Ph.D. and 1 MSc student. ...happily! ever after? 5,143 citations 5,143 citations ! S L L u ! B T * * h-index 37 Impact > Manhattan ! manhattan.inf.usi.ch Academics @ Work conflict Courtesy of Adrian Kuhn & Oscar Nierstrasz (Univ. of Bern, Switzerland) ! Eclipse Courtesy of Claus Lewerentz & Frank Steinbrückner (TU Cottbus, Germany) ! Academic Research Part III Transcendence Industrial Reality review rearchitect version tasks design edit edit reuse run compile RSS document run Twitter IRC debug Skype diff Facebook recommend refactor navigate visualize blog compile e-mail test reengineer ! ! ! ! for(int j=m; j>i; j--){ uCJM1= dataUC[j-1]; uCJ= dataUC[j]; if(uCJM1.compare(z)> { /* exchange */ tempStr= data[j-1]; /* sort the data */ data[j-1]= data[j]; data[j]= tempStr; dataUC[j-1]= uCJ; dataUC[j]= uCJM1; for(int j=m; j>i; j--){ uCJM1= dataUC[j-1]; uCJ= dataUC[j]; if(uCJM1.compare(z)> { /* exchange */ tempStr= data[j-1]; /* sort the data */ data[j-1]= data[j]; data[j]= tempStr; dataUC[j-1]= uCJ; dataUC[j]= uCJM1; } } } } ! ! ! ! ! (1) (2) (3) (4) (5) (6) ! (7) (8) (9) (10) (11) ! Alice wrote: On Mon 23, Bob wrote: Dear list, When starting up ArgoUML on my MacOS X system (Java 2) it throws a NullPointerException very soon. You'll find the trace below.
I hope someone knows a solution. Thanks a lot! Exception in thread "main" java.lang.NullPointerException at javax.swing.event.SwingSupport.fireChange(SwingChange.java) at javax.swing.AbstractAction.setEnabled(AbstractAction.java) [...] at uci.uml.Main.main(Main.java:148) (12) (13) (14) (15) I'm sorry I can't help you Bob but thanks for sharing the stack... Alice. -"Beware of programmers who carry screwdrivers." --L. Brandwein (16) (17) (18) (19) (20) Alice, I believe the flawed Explorer.java class generates Bob's issue: public void setEnclosingFig(Fig each) { super.setEnclosingFig(each); if (each != null || (each.getOwner() instanceof MPackage)) { m = (MPackage) each.getOwner(); } (21) (22) (23) (24) (25) (26) (27) (28) The problem is in the condition, I attach the diff with this version: --- src/org/argouml/ui/explorer/Explorer.java (revision 14338) +++ src/org/argouml/ui/explorer/Explorer.java (working copy) @@ -147,1 +147,1 @@ [...] super.setEnclosingFig(each); - if (each != null || (each.getOwner() instanceof MPackage)) { + if (each != null && (each.getOwner() instanceof MPackage)) { m = (MPackage) each.getOwner(); } (29) (30) (31) (32) (33) (34) Probably ModelTree is also affected, if so, please change it =) Cheers, Carl. -- I used to have a sig, but it took up much space so I got rid of it! --------------------------------------------------------------------To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected] ! ! ! Non-relevant Natural Language Stack trace Source code Patch The content of unstructured data produced during the evolution of a software system is a valuable information source to support software understanding and evolution and complements data mined from structured sources. ! Alberto Bacchelli ! Mining Unstructured Software Data Alice wrote: > On Mon 23, Bob wrote: > > Dear list, > > When starting up ArgoUML on my MacOS X system (Java 2) > > it throws a NullPointerException very soon. 
You'll find the > > trace below. I hope someone knows a solution. Thanks a lot! > > > > > > > > Exception in thread "main" java.lang.NullPointerException at javax.swing.event.SwingSupport.fireChange(SwingChange.java) at javax.swing.AbstractAction.setEnabled(AbstractAction.java) [...] > > at uci.uml.Main.main(Main.java:148) > > > > I'm sorry I can't help you Bob but thanks for sharing the stack... Alice. -"Beware of programmers who carry screwdrivers." --L. Brandwein Alice, I believe the flawed Explorer.java class generates Bob's issue: public void setEnclosingFig(Fig each) { super.setEnclosingFig(each); if (each != null || (each.getOwner() instanceof MPackage)) { m = (MPackage) each.getOwner(); } The problem is in the condition, I attach the diff with this version: --- src/org/argouml/ui/explorer/Explorer.java (revision 14338) +++ src/org/argouml/ui/explorer/Explorer.java (working copy) @@ -147,1 +147,1 @@ [...] super.setEnclosingFig(each); - if (each != null || (each.getOwner() instanceof MPackage)) { + if (each != null && (each.getOwner() instanceof MPackage)) { m = (MPackage) each.getOwner(); } Probably ModelTree is also affected, if so, please change it =) Cheers, Carl. -- I used to have a sig, but it took up much space so I got rid of it! --------------------------------------------------------------------To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected] Recommender Systems for Software Engineering “RSSEs are software applications that provide information estimated to be valuable for a software engineering task in a given context” ! PhD Thesis, University of Lugano, 2013 M. P. Robillard, R. J. Walker, and T. Zimmermann Recommendation systems for software engineering IEEE Software, 2010 ! Next-gen Development Environments + Recommender Systems ! No Spontaneous Recommendation ! ! ! ! ! ! No Self-Confidence ! ! ! Intelligent/Immersive Development Environments > Prompter !
prompter.inf.usi.ch Make Weapons out of Imperfections The Librarian daemon looks like a pleasant, fiftyish, silver-haired, bearded man. Even though he's just a piece of software, the librarian has reason to be cheerful; he can move through the nearly infinite stacks of information in the Library with the agility of a spider dancing across a vast web of cross-references. The only thing he can't do is think. “Yes, sir," the Librarian says. He is eager without being obnoxious, he clasps his hands behind his back, rocks forward slightly on the balls of his feet, raises his eyebrows expectantly over his half-glasses. 80 60 Completeness ● 40 is effective in development tasks 20 Prompter 100 Development Task NP P Treatment Librarians are there to: help; aid; assist; teach; collate; enthuse; catalogue; index; arrange; organize; find; discover; promote; display; interest; intrigue; amuse; amaze; help children, adults, old people, the underprivileged, the rich, the poor, those with voices and those without; protect resources, archive them and save them for the future; provide differing viewpoints; engender thought, conversation, fun; provide the best answers possible and match the answer to the enquirer; provide just enough information without overwhelming the user. ! NP = Without Prompter P = With Prompter Google is there to: make money. The End ! …of the Beginning old habits die hard Denn wir sind wie Baumstämme im Schnee. Scheinbar liegen sie glatt auf, und mit kleinem Anstoss sollte man sie wegschieben können. Nein, das kann man nicht, denn sie sind fest mit dem Boden verbunden. Aber sieh, sogar das ist nur scheinbar. deicIDE ! The Rise and Fall of Integrated Development Environments ! Michele Lanza ! ! Franz Kafka, “Die Bäume”, 1913 REVEAL | Faculty of Informatics University of Lugano, Switzerland !