Exploratory Data Analysis

In this lab, we will go through various steps to explore a dataset using descriptive statistics, summary of data, different graphs, etc.

Factor Variables (try the following in R):

# Read the patient records from the CSV file, then display them
data <- read.csv("patient.csv")
data

##    Patient Gender Age Group
## 1     Dick      M  20     2
## 2     Anna      F  25     1
## 3      Sam      M  30     3
## 4   Jennie      F  28     2
## 5     Joss      M  29     3
## 6      Don      M  21     2
## 7    Annie      F  26     1
## 8     John      M  32     3
## 9     Rose      F  27     2
## 10    Jack      M  31     3

# Gender is read in as a character (string) column
data$Gender

##  [1] "M" "F" "M" "F" "M" "M" "F" "M" "F" "M"

# Convert Gender to a factor. The levels are listed explicitly and
# ordered = TRUE makes it an ordered factor, hence "M < F" in the output.
data$Gender <- factor(data$Gender, levels = c("M", "F"), ordered = TRUE)
data$Gender

##  [1] M F M F M M F M F M
## Levels: M < F

# Group is read in as a numeric column
data$Group

##  [1] 2 1 3 2 3 2 1 3 2 3

# Convert Group to a factor: each distinct number becomes a level
data$Group <- as.factor(data$Group)
data$Group

##  [1] 2 1 3 2 3 2 1 3 2 3
## Levels: 1 2 3

Descriptive Statistics (try the following in R):

For character variable

# Vehicle type recorded for eight observations (a character vector)
Vehicles <- c(
  "Two-wheeler", "Four-wheeler", "Two-wheeler", "Four-wheeler",
  "Two-wheeler", "Four-wheeler", "Six-wheeler", "Two-wheeler"
)

# Frequency table: how many times each distinct value occurs
t <- table(Vehicles)
t

## Vehicles
## Four-wheeler  Six-wheeler  Two-wheeler 
##            3            1            4

# Pie chart: shows pictorially how the values are distributed
pie(t)

# Bar chart: one bar per category, height = count.
# The counts themselves are reused as colour indices.
barplot(t, col = as.vector(t))

For numerical variable

# Movie ratings on a 0-10 scale
Ratings <- c(5.6, 3.4, 1.5, 9.3, 7.5, 6.5, 5.2, 2.8, 2.5, 2.9, 2.1, 1.0, 1.1, 0.0)

# Compute the bins without drawing, so the number of bars is known up front
h <- hist(Ratings, plot = FALSE)

# Histogram: frequency of the ratings falling in each range.
# One colour index per bar (2, 3, 4, ...).
hist(Ratings, col = seq_along(h$counts) + 1)

# Same bins on a density scale instead of counts (bar areas sum to 1)
hist(Ratings, col = seq_along(h$counts) + 1, freq = FALSE)

# We can define our own bin boundaries: -0.1, 2.9, 5.9, 8.9, 11.9
breaks <- seq(-0.1, 12, 3)

h2 <- hist(Ratings, breaks = breaks, plot = FALSE)

# Colours are indexed by bar rather than derived from the break values,
# which here are fractional and would be silently truncated to integers.
hist(
  Ratings,
  breaks = breaks,
  xlab = "Average ratings for movies",
  main = "Movie Ratings on scale 10",
  col = seq_along(h2$counts) + 1
)

We can see that the breaks we defined place the bars in four ranges: -0.1 to 2.9, 2.9 to 5.9, 5.9 to 8.9, and 8.9 to 11.9.

Calculating Mean, Median, Variance, Standard Deviation:

# Load the saved R data file into the session (it is expected to provide `score`)
load("score.Rdata")
# Display the data in score
score

##   [1] 373 216 178 348 319 236 356 134 326 379 312 378 128 262  86 154 266 214
##  [19] 339 138 338 389 375 379  91 160  69  93 392 119 170  89 221  74 297 247
##  [37]  88 347 329 209  63 179  94  71 255 279 242 153 375 304 375 153 152 380
##  [55]  62 345 266 225 394 328 159 133  78 190 301 270 157 353  82  94 198 336
##  [73] 151 194 378 392 388 167 395 353 156 113 273 152 365 100 270 339 187 178
##  [91] 348 331 192 334 219  97 253 344  73 230 263 345 274 388 395 212  92  50
## [109]  78 127 375 333 165 282 342 110 135 376 159 356 349  98 365 291 295 354
## [127] 355 296 288 268 184 160  69 100 102 179 353 114 349 154 173 126 267 147
## [145] 243  68 322  80 223 286 124  65  89 314 355 141 171 201 187 256 298 209
## [163] 278 189 175  63 165  65 320 179 114 322 261 176 182  90 377  85 354 346
## [181] 113 178 358 261 393 166 358 151  99 385 335 303 398 121 127 217 377 165
## [199] 162 283

# Arithmetic mean of score
mean(score)

## [1] 231.26

# A small hand-made vector used for quick checks
x <- c(10, 50, 30, 5, 58)
# Mean of x
mean(x)

## [1] 30.6

# Median (middle value) of score
median(score)

## [1] 222

# Median of x
median(x)

## [1] 30

x1 <- c(-10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20)
# Trimmed mean: drop a 0.1 fraction of the sorted values from each end,
# then average what remains
mean(x1, trim = 0.1)

## [1] 5.5

Exercise: Change the order of the values in x1 and calculate trimmed mean. Try different values of trim.

# Range (minimum and maximum) of x
range(x)

## [1]  5 58

# Range of score
range(score)

## [1]  50 398

# 25th percentile of x
quantile(x, probs = 0.25)

## 25% 
##  10

# 50th percentile of score
quantile(score, probs = 0.5)

## 50% 
## 222

# 25th and 75th percentiles of score
quantile(score, probs = c(0.25, 0.75))

##   25%   75% 
## 145.5 336.5

# Interquartile range: 75th percentile minus 25th percentile
IQR(score)

## [1] 191

# Boxplot of score
boxplot(score)

# Variance from the formula: mean squared deviation from the mean
mean((score - mean(score))^2)

## [1] 11160.58

# Variance of score using the built-in function
var(score)

## [1] 11216.67

# The two variance values differ because of the degrees of freedom:
# the formula above divides by n (i.e. 200), var() divides by n - 1 (i.e. 199)

# Standard deviation of score
sd(score)

## [1] 105.9088

Calculating Mean Absolute Deviation and Median Absolute Deviation

# Mean absolute deviation, step by step
x_mean <- mean(x)
AD <- abs(x - x_mean)   # absolute deviation of each value from the mean
AAD <- mean(AD)         # average of those absolute deviations
AAD

## [1] 18.72

# Median absolute deviation of x.
# mad() multiplies the raw median absolute deviation by 1.4826 by default;
# for x the raw value is 20, hence 29.652.
mad(x)

## [1] 29.652

# Median absolute deviation of score
mad(score)

## [1] 146.7774

Summary of Numerical Variable (try the following in R):

# Minimum, quartiles, median, mean and maximum in a single display
summary(score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    50.0   145.5   222.0   231.3   336.5   398.0

# The boxplot draws the same quartile information graphically
boxplot(score)

Summary of Categorical Variable (try the following in R):

# Load the saved R data file (it is expected to provide `countries`)
load("countries.Rdata")
# Display the data
countries

##   [1] India       India       India       India       India       India      
##   [7] India       India       India       India       India       India      
##  [13] India       India       India       India       India       India      
##  [19] India       India       Pakistan    Pakistan    Pakistan    Pakistan   
##  [25] Pakistan    Pakistan    Pakistan    Pakistan    Pakistan    Pakistan   
##  [31] Pakistan    Pakistan    Pakistan    Pakistan    Pakistan    Pakistan   
##  [37] Pakistan    Pakistan    UK          UK          UK          UK         
##  [43] UK          UK          UK          UK          UK          UK         
##  [49] UK          UK          UK          UK          UK          UK         
##  [55] UK          UK          UK          UK          UK          UK         
##  [61] UK          UK          UK          UK          UK          UK         
##  [67] UK          UK          Australia   Australia   Australia   Australia  
##  [73] Australia   Australia   Australia   Australia   Australia   Australia  
##  [79] Australia   Australia   Australia   Australia   Australia   Australia  
##  [85] Australia   Australia   Australia   Australia   Australia   Australia  
##  [91] Australia   Australia   Australia   Australia   Australia   Australia  
##  [97] Australia   Australia   Australia   Australia   Australia   Australia  
## [103] Australia   Australia   Australia   Australia   Australia   Australia  
## [109] Sri Lanka   Sri Lanka   Sri Lanka   Sri Lanka   Sri Lanka   Sri Lanka  
## [115] Sri Lanka   Sri Lanka   Sri Lanka   Sri Lanka   Sri Lanka   Sri Lanka  
## [121] Sri Lanka   Sri Lanka   Sri Lanka   New Zealand New Zealand New Zealand
## [127] New Zealand New Zealand New Zealand New Zealand New Zealand New Zealand
## [133] New Zealand New Zealand New Zealand New Zealand New Zealand New Zealand
## [139] New Zealand New Zealand New Zealand New Zealand New Zealand New Zealand
## [145] New Zealand New Zealand New Zealand New Zealand
## Levels: Australia India New Zealand Pakistan Sri Lanka UK

# Summary of a factor: the number of observations in each level
summary(countries)

##   Australia       India New Zealand    Pakistan   Sri Lanka          UK 
##          40          20          25          18          15          30

# table() displays the same counts as summary() for this data
t <- table(countries)
t

## countries
##   Australia       India New Zealand    Pakistan   Sri Lanka          UK 
##          40          20          25          18          15          30

# Pie chart of the counts
pie(t)

# Bar chart of the counts; the counts are reused as colour indices
barplot(t, col = as.vector(t))

Summary of dataset (try the following in R):

# Read the clinical trial data from CSV
clinicalData <- read.csv("clinicalData.csv")

# Display the whole data frame
clinicalData

##       treatment medicine euthymia
## 1   noTreatment     med2      0.3
## 2   noTreatment     med1      1.7
## 3  newTreatment     med2      0.7
## 4  newTreatment     med1      0.3
## 5  newTreatment     med2      1.5
## 6   noTreatment     med2      0.5
## 7  newTreatment     med1      1.1
## 8  newTreatment     med1      0.3
## 9   noTreatment  placebo      0.1
## 10  noTreatment  placebo      1.9
## 11  noTreatment  placebo      1.9
## 12 newTreatment  placebo      0.1
## 13  noTreatment     med1      1.7
## 14  noTreatment  placebo      1.7
## 15 newTreatment     med1      1.5
## 16  noTreatment  placebo      0.9
## 17 newTreatment     med1      1.9
## 18 newTreatment  placebo      1.7
## 19 newTreatment     med2      0.1
## 20  noTreatment     med1      1.3

# Column-by-column summary of the whole data frame
summary(object = clinicalData)

##   treatment           medicine            euthymia   
##  Length:20          Length:20          Min.   :0.10  
##  Class :character   Class :character   1st Qu.:0.30  
##  Mode  :character   Mode  :character   Median :1.20  
##                                        Mean   :1.06  
##                                        3rd Qu.:1.70  
##                                        Max.   :1.90

Exploring variables in a dataset (try the following in R):

# Effect of medicines on euthymia: mean euthymia for each medicine
aggregate(euthymia ~ medicine, data = clinicalData, FUN = mean)

##   medicine euthymia
## 1     med1 1.225000
## 2     med2 0.620000
## 3  placebo 1.185714

# Effect of treatment on euthymia: one box per treatment.
# `:` binds tighter than `+`, so the colours are (2:1) + number of treatments.
boxplot(
  euthymia ~ treatment,
  data = clinicalData,
  col = (2:1) + length(unique(clinicalData$treatment))
)

# Effect of treatment and medicine combined on euthymia:
# mean euthymia for every medicine/treatment combination.
# The formula is passed positionally: the first argument of aggregate()'s
# formula method is named `formula` before R 4.2.0 and `x` from 4.2.0 on,
# so naming it is not portable across R versions.
aggregate(euthymia ~ medicine + treatment, data = clinicalData, FUN = mean)

##   medicine    treatment  euthymia
## 1     med1 newTreatment 1.0200000
## 2     med2 newTreatment 0.7666667
## 3  placebo newTreatment 0.9000000
## 4     med1  noTreatment 1.5666667
## 5     med2  noTreatment 0.4000000
## 6  placebo  noTreatment 1.3000000

Exercise: Repeat the 1st and 3rd analyses above (euthymia by medicine, and euthymia by medicine and treatment) using median instead of mean as the function.

# Summary of the dataset computed separately for each treatment
by(clinicalData, clinicalData$treatment, summary)

## clinicalData$treatment: newTreatment
##   treatment           medicine            euthymia   
##  Length:10          Length:10          Min.   :0.10  
##  Class :character   Class :character   1st Qu.:0.30  
##  Mode  :character   Mode  :character   Median :0.90  
##                                        Mean   :0.92  
##                                        3rd Qu.:1.50  
##                                        Max.   :1.90  
## ------------------------------------------------------------ 
## clinicalData$treatment: noTreatment
##   treatment           medicine            euthymia  
##  Length:10          Length:10          Min.   :0.1  
##  Class :character   Class :character   1st Qu.:0.6  
##  Mode  :character   Mode  :character   Median :1.5  
##                                        Mean   :1.2  
##                                        3rd Qu.:1.7  
##                                        Max.   :1.9

# Bar plot of the number of patients for each treatment/medicine combination
Tb <- table(clinicalData$treatment, clinicalData$medicine)
barplot(
  Tb,
  col = Tb[, 2],   # counts in the second column are reused as colour indices
  legend.text = TRUE,
  main = "Treatments and Medicine to different patients"
)

# The plot above only gives counts of patients, but we want to see the
# effect on euthymia

# Bar chart of mean euthymia for each treatment (rows) and medicine (columns)
treat_result <- tapply(
  clinicalData$euthymia,
  list(clinicalData$treatment, clinicalData$medicine),
  mean
)

barplot(
  treat_result,
  beside = TRUE,   # draw the treatments side by side instead of stacked
  col = round(treat_result[, 3] + 1.3),   # colour indices derived from the third column
  legend.text = TRUE,
  main = "Euthymia vs Treatments and Medicine"
)

Exercise: Try to learn about tapply, sapply, lapply, vapply and apply functions in R

Relation between two numeric variables

We will use the dataset “Boston” from package MASS

# MASS ships many datasets that are handy for practice
library(MASS)

# First six rows of the Boston data from the MASS package
head(Boston, n = 6L)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

# Scatter plot to see the relation between the age and tax variables
plot(Boston$age, Boston$tax)

# Correlation between crime rate per capita and pupil-teacher ratio by town
cor(Boston$crim, Boston$ptratio)

## [1] 0.2899456

# Correlation matrix: correlation values of all possible pairs of columns
cor(Boston)

##                crim          zn       indus         chas         nox
## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
##                  rm         age         dis          rad         tax    ptratio
## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
##               black      lstat       medv
## crim    -0.38506394  0.4556215 -0.3883046
## zn       0.17552032 -0.4129946  0.3604453
## indus   -0.35697654  0.6037997 -0.4837252
## chas     0.04878848 -0.0539293  0.1752602
## nox     -0.38005064  0.5908789 -0.4273208
## rm       0.12806864 -0.6138083  0.6953599
## age     -0.27353398  0.6023385 -0.3769546
## dis      0.29151167 -0.4969958  0.2499287
## rad     -0.44441282  0.4886763 -0.3816262
## tax     -0.44180801  0.5439934 -0.4685359
## ptratio -0.17738330  0.3740443 -0.5077867
## black    1.00000000 -0.3660869  0.3334608
## lstat   -0.36608690  1.0000000 -0.7376627
## medv     0.33346082 -0.7376627  1.0000000

Exercise: Go online and search for the details of the Boston data in R. Find out what this data is about and what each variable means. Plot all the pairs of variables and calculate their correlation values individually. Try to understand their relationships.

# Compare the correlation of the original values with the correlation of their ranks
wages <- read.csv("Wages.csv")
wages

##    daysWorked wagesEarned
## 1           4          16
## 2          80         100
## 3          45          85
## 4           5          13
## 5          18          23
## 6          30          78
## 7          32          50
## 8          60          90
## 9          48          86
## 10         70          92

# Correlation computed on the raw values
cor(wages$daysWorked, wages$wagesEarned)

## [1] 0.9217775

# Instead of the numerical values, the ranks of the observations can be
# used to calculate the correlation
d.rank <- rank(wages$daysWorked)
# Ranks of the observations in daysWorked
d.rank

##  [1]  1 10  6  2  3  4  5  8  7  9

w.rank <- rank(wages$wagesEarned)
# Ranks of the observations in wagesEarned
w.rank

##  [1]  2 10  6  1  3  5  4  8  7  9

# Correlation of the ranks (this is Spearman's rank correlation)
cor(d.rank, w.rank)

## [1] 0.9757576

Exercise: Using the clinicalData example, go through all the plots and techniques used in this lab (bar plots, pie charts, histograms, boxplots, aggregate, summary, mean, median, correlation, etc.). Based on these plots and calculations, draw your inferences about the data (the stories it tells you). This is how you perform exploratory data analysis.

Use different data sets from MASS package to perform exploratory data analysis

Sampling (try the following in R):

Sometimes, we need to generate samples for creating simulated data for our experiments. We need to use set.seed() for the reproducibility of the results.

# Fix the random seed so the results below are reproducible
set.seed(1)
x <- 1:100
# Draw 50 values from x at random; replace = TRUE allows a value to be
# drawn more than once
S <- sample(x, 50, replace = TRUE)
S

##  [1] 68 39  1 34 87 43 14 82 59 51 97 85 21 54 74  7 73 79 85 37 89 37 34 89 44
## [26] 79 33 84 35 70 74 42 38 20 28 20 44 87 70 40 44 25 70 39 51 42  6 24 32 14

# rnorm() generates random normal data: 200 values with mean 5 and standard deviation 2
nd <- rnorm(200, mean = 5, sd = 2)
nd

##   [1]  4.8923899  2.2458809  4.1700109  4.2114201  4.8813732  7.2000507
##   [7]  6.5263515  4.6709528  4.4932766  6.3939268  6.1133264  3.6224886
##  [13]  3.5850097  5.7291639  6.5370658  4.7753076  6.7622155  5.7962118
##  [19]  3.7759472  5.6822394  2.7412738  7.8660474  8.9607998  4.2655570
##  [25]  2.9117307  6.1394393  4.7298908  9.8032355  4.9215200  6.3794787
##  [31]  5.0560043  3.5134536  5.3775846  1.3900827  7.9311097  5.3065067
##  [37]  9.3452233  5.9510191  3.5801071  6.2214527  3.1318047  2.4927332
##  [43]  5.5828925  4.1134163  5.0022107  5.1486826  3.8209581  3.8626625
##  [49]  4.7296428  7.3561740  1.9528664  6.1878924  5.6659007  7.1261997
##  [55]  4.3916322  5.7400376  5.5341976  3.9149599  7.4157356  7.3208052
##  [61]  6.4004273  8.1736669  6.1169729  2.4468156  3.8534692  2.5507748
##  [67]  4.0531987  3.7592666  5.0842317  3.1781567  5.3160575  3.6908307
##  [73]  8.5345745  6.4334150  6.8203485  5.7683707  8.3643522  3.7285271
##  [79]  4.0767105  7.8645645  3.6986073  4.5852385  4.2143841  4.3600143
##  [85]  4.4417734  5.9883767  4.6453390  3.9880851  7.6860777  4.5708412
##  [91]  4.6408869  4.7996185  6.4253326  4.8528712  4.9247317  3.6366790
##  [97]  4.3514595  5.1203209  3.8222110  6.0629924  1.9632118  5.6131157
## [103]  1.9271004  4.3980477  3.9434402  3.6958104  4.8862064  1.1712811
## [109]  7.3531666  1.6700551  4.0729392  2.7681598  3.4983620  9.1743331
## [115]  5.0347912  2.4273989  1.7187889  5.9003742  4.9628803  4.3638633
## [121]  3.1412757  2.0250794  2.8496154  7.0000576  3.7574666  2.2311463
## [127]  8.7385812  5.8502008  4.5227058  7.1169661  6.7728453  3.7615139
## [133]  9.4122049  4.4899459  2.1510107  4.7112008  5.4150767  9.6159568
## [139]  5.2116047  5.9139976  4.8456941  4.3319983  4.9305479  6.5752792
## [145]  9.1504900  7.0547849  7.4158168  2.5373532  6.9677911  5.4398496
## [151]  2.0654999  6.0420455  4.6824908  7.9291746  3.4678360  4.1395765
## [157]  3.1477810  4.6457921  5.8040236  3.5365037  6.6607463  2.5838344
## [163]  2.9040312  7.8823154  2.9683051  5.8239494  4.2378479  5.8188037
## [169]  8.3777466  8.1731769  4.3381844  0.4295289  9.9953232  6.3341323
## [175]  6.0826547  4.9732010  6.0202168  4.6712483  5.8413893  4.1995065
## [181]  2.2595842  6.9756765  8.0394901  4.3825189  2.4934205  6.2844826
## [187]  4.9105817  1.5335632  5.0042637  3.7393993  4.3180628  2.6868553
## [193]  8.6062838  4.3377359  1.7889732  5.3943869  5.5263513  3.0283466
## [199] -0.7778413  3.7190366

# Compute the bins first (no drawing) so each bar can be given its own colour
h <- hist(nd, plot = FALSE)
# One colour index per bar, starting at 2. Colours derived from h$breaks
# would give index 0 (the background colour) or a negative index (an error)
# when the sample contains values below zero.
hist(nd, col = seq_along(h$counts) + 1)

# Combining rnorm and sample: draw 50 of the generated normal values
# (sample() draws without replacement unless replace = TRUE is given)
Snd <- sample(nd, 50)
Snd

##  [1] 3.8534692 2.6868553 2.2595842 9.4122049 3.6958104 0.4295289 7.9311097
##  [8] 3.1412757 8.9607998 3.1477810 1.7187889 4.3377359 7.8823154 4.9628803
## [15] 3.8222110 2.9040312 2.5373532 5.4398496 8.6062838 8.3777466 5.9883767
## [22] 1.1712811 5.7962118 4.1134163 4.2378479 4.9215200 4.6408869 1.5335632
## [29] 3.5134536 1.9528664 4.9105817 4.3381844 3.7574666 7.6860777 6.9677911
## [36] 2.7412738 4.0767105 5.0347912 1.9632118 3.7285271 5.3160575 5.8502008
## [43] 7.4157356 4.5708412 3.9149599 6.5370658 9.1743331 2.4468156 4.4899459
## [50] 1.3900827

# runif() generates random uniform data: 500 values between 0 and 1
rnf <- runif(500)
# Compute the bins without drawing so the bin counts are available
h <- hist(rnf, plot = FALSE)
# Histogram of the uniform sample; each bar's count is reused as its colour index
hist(rnf, col = h$counts)

Exercise: Go online and learn why generating random data is important in data science, and which options R provides for doing so.

Business and AI

Search This Blog