Data Cleaning and Preprocessing 1

In this lab, we will learn how to hanlde missing values, how to standardize/normalize numeric variables in a dataset, how to handle outliers.

Handling Missing Values (try the following in R):

data = read.csv("data_missing.csv", stringsAsFactors = FALSE, na.strings= c("NA", "")) 
#Note: na.strings = “” allows empty rows of character variables to be read as “na”

data

##    Exp     Name Income.in.thousands
## 1   12      Raj                  40
## 2   31     Ajay                  60
## 3   25  Raanjit                 AAA
## 4   NA Sangeeta                  70
## 5   34     Rhea                  35
## 6   20   Rehana                 AAA
## 7   NA     Anil                  90
## 8   25     Asif                 100
## 9   27   Sanjay                <NA>
## 10  26     <NA>                  55
## 11  12      Raj                  40
## 12  30     Rahu                  45
## 13  27   Sanjay                  65
## 14  15     <NA>                  34
## 15  16     Rija                  32
## 16  30  Dominic                  25
## 17  32     Liza                  50
## 18  21     Maya                  60
## 19  23    Reena                  51

complete.cases(data)            #identify rows with missing values as False

##  [1]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE
## [13]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE

data[!complete.cases(data),]        #Display rows with missing values

##    Exp     Name Income.in.thousands
## 4   NA Sangeeta                  70
## 7   NA     Anil                  90
## 9   27   Sanjay                <NA>
## 10  26     <NA>                  55
## 14  15     <NA>                  34

class(data$Income.in.thousands)     #Shows class as character due to presence of some character values

## [1] "character"

#Converting back to numeric, converts “AAA” to NA
data$Income.in.thousands = as.numeric(data$Income.in.thousands)

## Warning: NAs introduced by coercion

Imputation of missing values (try the following in R):

Replacing by mean:

#Replacing missing values with mean experience
expMean = mean(data$Exp, na.rm=T)                    #Mean of experience
Inx = is.na(data$Exp)                            #Indices of missing experience
data$Exp[Inx] = expMean                          #Replacing with mean   


#Replacing missing values with mean income
incMean = mean(data$Income.in.thousands, na.rm=T)            #Mean of Income
Inx2 = is.na(data$Income.in.thousands)                   #Indices of missing income
data$Income.in.thousands[Inx2] = incMean                 #Replacing with mean

Replacing by median:

#Replacing missing values with median experience
expMed = median(data$Exp, na.rm=T)                   #Median of experience
Inx = is.na(data$Exp)                            #Indices of missing experience
data$Exp[Inx] = expMed                       #Replacing with median

Exercise: Write a function to replace missing values (mean & median), use it for income.

Replacing by mode:

#Replacing missing values with mode of name
T = table(data$Name); T                 #Name as a table with counts

## 
##     Ajay     Anil     Asif  Dominic     Liza     Maya  Raanjit     Rahu 
##        1        1        1        1        1        1        1        1 
##      Raj    Reena   Rehana     Rhea     Rija Sangeeta   Sanjay 
##        2        1        1        1        1        1        2

which.max(T)                        #Name repeated maximum times

## Raj 
##   9

Mode = names(which.max(T))              #Mode of name   
Mode

## [1] "Raj"

Inx = is.na(data$Name)                  #Indices of missing names
data$Name[Inx] = Mode               #Replacing with mode
data$Name

##  [1] "Raj"      "Ajay"     "Raanjit"  "Sangeeta" "Rhea"     "Rehana"  
##  [7] "Anil"     "Asif"     "Sanjay"   "Raj"      "Raj"      "Rahu"    
## [13] "Sanjay"   "Raj"      "Rija"     "Dominic"  "Liza"     "Maya"    
## [19] "Reena"

#But, there could be multiple character values with max count, as in our data:

data$Name[Inx] = NA #To get the original variable with missing values for the imputation below
T = table(data$Name); T

## 
##     Ajay     Anil     Asif  Dominic     Liza     Maya  Raanjit     Rahu 
##        1        1        1        1        1        1        1        1 
##      Raj    Reena   Rehana     Rhea     Rija Sangeeta   Sanjay 
##        2        1        1        1        1        1        2

mode_ind = as.vector(T) == max(as.vector(T))        #Indices of all maximum counts
mode_list = names(T)[mode_ind]
mode_list                   #There are two, “Raj” and “Sanjay”

## [1] "Raj"    "Sanjay"

Inx = is.na(data$Name)              #Indices of missing names
data$Name[Inx] = mode_list [2]      #Replacing with mode of your choice
data$Name

##  [1] "Raj"      "Ajay"     "Raanjit"  "Sangeeta" "Rhea"     "Rehana"  
##  [7] "Anil"     "Asif"     "Sanjay"   "Sanjay"   "Raj"      "Rahu"    
## [13] "Sanjay"   "Sanjay"   "Rija"     "Dominic"  "Liza"     "Maya"    
## [19] "Reena"

Normalizing/Standardization (try the following in R):

Normalizing income

inc_s = data$Income.in.thousands 
inc_s  = (inc_s - mean(inc_s))/sd(inc_s)
inc_s

##  [1] -0.70066492  0.35694251  0.00000000  0.88574622 -0.96506678  0.00000000
##  [7]  1.94335365  2.47215737  0.00000000  0.09254065 -0.70066492 -0.43626307
## [13]  0.62134437 -1.01794715 -1.12370790 -1.49387050 -0.17186121  0.35694251
## [19] -0.11898084

Normalizing experience

exp_s = data$Exp
exp_s  = (exp_s - mean(exp_s))/sd(exp_s)
exp_s

##  [1] -1.8193851  1.0898297  0.1711303  0.0000000  1.5491794 -0.5944526
##  [7]  0.0000000  0.1711303  0.4773634  0.3242468 -1.8193851  0.9367131
## [13]  0.4773634 -1.3600354 -1.2069188  0.9367131  1.2429462 -0.4413360
## [19] -0.1351029

Before normalizing the data, replace or delete the missing values

Exercise: Try the following: Write a function that normalizes a numerical variable. If you wish, try creating a function that normalizes all the numerical variables in a dataset (function should take dataset itself as input, considering the dataset has only numerical variables, and outputs normalized dataset). Hint: To get a variable of dataframe, data[, i]; i = 1 to n, n is the number of variables; use dim() for n; data[, -2] removes “Name” variable from data used above.

Handling Outliers (try the following in R):

set.seed(1)                             #For reproducibility
X =  sample(seq(.1,2, by=.2), 20, replace=TRUE)             #Sample data

X = c(X, 3.5)                           #Adding an outlier
#It is just an example, in real data you don’t add an outlier, you rather identify it

boxplot(X)                  #Plot the data and identify the outlier

boxplot.stats(X)$out                #Display outliers, which is 3.5

## [1] 3.5

Ind = X %in% boxplot.stats(X)$out       #All indices as TRUE & FALSE, Trues are outliers
Ind

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE

Ind = which(Ind)                #Indices of outliers in X
Ind

## [1] 21

Exercise: Try the following:

set.seed(1); X = rnorm(2000,5,1)

Using the above data find out all the outliers.

Click the links below to learn more

Data Cleaning and Preprocessing 2

Business and AI

Search This Blog