Open In Colab

Introduction to R

What is R

R is a language and environment for statistical computing and graphics.

  • Runs on a variaty of operation systerms: Windows, Linux and MacOS

  • Generates publication-ready plots

  • Owns a large open-source community

  • Extends functions as packages

Essetnial concepts for programming languages

Variables

Variable names can have letters, dots and undercores (e.g. gene_name, "csvfile.1" and chr1).

Functions

A function is a structured, reusable segment of code designed to carry out a specific set of operations. It can accept zero or more inputs (parameters) and can produce an output (result).

INPUT --function--> OUTPUT

The way to define a function is:

read_star_file <- function(file_path){
    ...code goes here...
    return(df)
}

The way to use a function in R is:

read_star_file(paths)

Basic data types

# assign a number to variable gene_count
gene1 <- 150
gene2 <- 200
gene1
gene2

150

200

# Examples of character value
gene_name1 <- "KRAS"
gene_name1

'KRAS'

# Examples of character value
"RAS" -> gene_name2
gene_name2

'RAS'

# Examples of character value
gene_name3 <- "KRAS"
gene_name3

'KRAS'

# Logical value
gene1 > gene2
gene1 < gene2
bool_val = gene1 < gene2

FALSE

TRUE

class(gene1)
class(gene_name)
class()

'numeric'

'character'

Basic data structure

An atomic vector is a collection of multiple values (numeric, character, or logical) stored in a single object. You can create an atomic vector using the c() function.

sample_names <- c("N2_day1_rep1",   "N2_day1_rep2", "N2_day1_rep3",
                  "N2_day7_rep1", "N2_day7_rep2", "N2_day7_rep3")
sample_names
  1. 'N2_day1_rep1'
  2. 'N2_day1_rep2'
  3. 'N2_day1_rep3'
  4. 'N2_day7_rep1'
  5. 'N2_day7_rep2'
  6. 'N2_day7_rep3'
group <- gsub("_rep\\d", "", sample_names)
group
  1. 'N2_day1'
  2. 'N2_day1'
  3. 'N2_day1'
  4. 'N2_day7'
  5. 'N2_day7'
  6. 'N2_day7'
coldata_df <- cbind(group = gsub("_rep\\d", "", sample_names))
coldata_df
A matrix: 6 × 1 of type chr
group
N2_day1
N2_day1
N2_day1
N2_day7
N2_day7
N2_day7
rownames(coldata_df) = sample_names
coldata_df
A matrix: 6 × 1 of type chr
group
N2_day1_rep1N2_day1
N2_day1_rep2N2_day1
N2_day1_rep3N2_day1
N2_day7_rep1N2_day7
N2_day7_rep2N2_day7
N2_day7_rep3N2_day7
t(coldata_df)
A matrix: 1 × 6 of type chr
N2_day1_rep1N2_day1_rep2N2_day1_rep3N2_day7_rep1N2_day7_rep2N2_day7_rep3
groupN2_day1N2_day1N2_day1N2_day7N2_day7N2_day7
is.matrix(coldata_df)
coldata_df <- as.data.frame(coldata_df)
is.data.frame(coldata_df)

TRUE

TRUE

A matrix can contain either character or numeric columns and a dataframe can contain both numeric and character columns.

A list is an ordered collection of objects, which can be any type of R objects (vectors, matrices, data frames, even lists).

count_files = list("sample" = 'N2_day1_rep1.ReadsPerGene.out.tab', "sample2"='N2_day1_rep2.ReadsPerGene.out.tab')
count_files
$sample
'N2_day1_rep1.ReadsPerGene.out.tab'
$sample2
'N2_day1_rep2.ReadsPerGene.out.tab'
gene_count = list("gene1" = 10, "gene2"=20)
lapply(gene_count, function(x){log2(x+1)})
$gene1
3.4594316186373
$gene2
4.39231742277876
log2_transform <- function(x){
    log2(x+1)
}
lapply(gene_count, log2_transform)
$gene1
3.4594316186373
$gene2
4.39231742277876

Dealing with text files

getwd()
#setwd()

'/content'

list.files()
  1. 'N2_day1_rep1.ReadsPerGene.out.tab'
  2. 'N2_day1_rep2.ReadsPerGene.out.tab'
  3. 'N2_day1_rep3.ReadsPerGene.out.tab'
  4. 'N2_day7_rep1.ReadsPerGene.out.tab'
  5. 'N2_day7_rep2.ReadsPerGene.out.tab'
  6. 'N2_day7_rep3.ReadsPerGene.out.tab'
  7. 'sample_data'
file_paths <- list.files(pattern = "*..ReadsPerGene.out.tab")
file_paths
  1. 'N2_day1_rep1.ReadsPerGene.out.tab'
  2. 'N2_day1_rep2.ReadsPerGene.out.tab'
  3. 'N2_day1_rep3.ReadsPerGene.out.tab'
  4. 'N2_day7_rep1.ReadsPerGene.out.tab'
  5. 'N2_day7_rep2.ReadsPerGene.out.tab'
  6. 'N2_day7_rep3.ReadsPerGene.out.tab'
tab_N2_day1_rep1 <- read.table('N2_day1_rep1.ReadsPerGene.out.tab')
head(tab_N2_day1_rep1, 5)
A data.frame: 5 × 4
V1V2V3V4
<chr><int><int><int>
1N_unmapped 1332776 1332776 1332776
2N_multimapping1540190 1540190 1540190
3N_noFeature 1571021901745518933214
4N_ambiguous 536422 128854 120342
5WBGene00000003 341 161 180
tab_N2_day1_rep2 <- read.table('N2_day1_rep2.ReadsPerGene.out.tab')
head(tab_N2_day1_rep2,5)
A data.frame: 5 × 4
V1V2V3V4
<chr><int><int><int>
1N_unmapped 1400596 1400596 1400596
2N_multimapping1305129 1305129 1305129
3N_noFeature 1521831500978614975925
4N_ambiguous 439830 104631 98489
5WBGene00000003 415 198 217
tab_N2_day1_rep3 <- read.table('N2_day1_rep3.ReadsPerGene.out.tab')
head(tab_N2_day1_rep3,5)
A data.frame: 5 × 4
V1V2V3V4
<chr><int><int><int>
1N_unmapped 5887223 5887223 5887223
2N_multimapping1557570 1557570 1557570
3N_noFeature 1844411761235917574940
4N_ambiguous 514559 122385 115498
5WBGene00000003 411 175 236
library(dplyr)
Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
tab_N2_day1_rep1 <- tab_N2_day1_rep1 %>% select(V1, V2)
head(tab_N2_day1_rep1, 5)
tab_N2_day1_rep2 <- tab_N2_day1_rep2[, c("V1", "V2")]
head(tab_N2_day1_rep2, 5)
tab_N2_day1_rep3 <- tab_N2_day1_rep3[, c("V1", "V2")]
head(tab_N2_day1_rep3, 5)
A data.frame: 5 × 2
V1V2
<chr><int>
1N_unmapped 1332776
2N_multimapping1540190
3N_noFeature 157102
4N_ambiguous 536422
5WBGene00000003 341
A data.frame: 5 × 2
V1V2
<chr><int>
1N_unmapped 1400596
2N_multimapping1305129
3N_noFeature 152183
4N_ambiguous 439830
5WBGene00000003 415
A data.frame: 5 × 2
V1V2
<chr><int>
1N_unmapped 5887223
2N_multimapping1557570
3N_noFeature 184441
4N_ambiguous 514559
5WBGene00000003 411
df_merged <- merge(tab_N2_day1_rep1, tab_N2_day1_rep2, by = "V1")
head(df_merged)
df_merged <- merge(df_merged, tab_N2_day1_rep3, by = "V1")
head(df_merged)
A data.frame: 6 × 3
V1V2.xV2.y
<chr><int><int>
1N_ambiguous 536422 439830
2N_multimapping15401901305129
3N_noFeature 157102 152183
4N_unmapped 13327761400596
5WBGene00000001 3227 2168
6WBGene00000002 270 203
A data.frame: 6 × 4
V1V2.xV2.yV2
<chr><int><int><int>
1N_ambiguous 536422 439830 514559
2N_multimapping154019013051291557570
3N_noFeature 157102 152183 184441
4N_unmapped 133277614005965887223
5WBGene00000001 3227 2168 2589
6WBGene00000002 270 203 266

The Reduce() function in R allows us to apply a function repeatedly to a list of elements. Here, we are applying merge() iteratively to a list of data frames.

df_mer_red <- Reduce(function(x, y) merge(x, y, by = "V1"),
                        list("tab_N2_day1_rep1" = tab_N2_day1_rep1,
                             "tab_N2_day1_rep2" = tab_N2_day1_rep2,
                             "tab_N2_day1_rep3" = tab_N2_day1_rep3))
head(df_mer_red, 5)
A data.frame: 5 × 4
V1V2.xV2.yV2
<chr><int><int><int>
1N_ambiguous 536422 439830 514559
2N_multimapping154019013051291557570
3N_noFeature 157102 152183 184441
4N_unmapped 133277614005965887223
5WBGene00000001 3227 2168 2589

Producing Graphs with R

Producing Graphs using basic

Line charts

# Define the cars vector with 5 values
cars <- c(1, 3, 6, 4, 9)

# Graph the cars vector with all defaults
plot(cars)

png

Let's add a title, a line to connect the points, and some color:

# Define the cars vector with 5 values
cars <- c(1, 3, 6, 4, 9)

# Graph cars using blue points overlayed by a line
plot(cars, type="o", col="blue")

# Create a title with a red, bold/italic font
title(main="Autos", col.main="red", font.main=4)

png

# Define 2 vectors
cars <- c(1, 3, 6, 4, 9)
trucks <- c(2, 5, 4, 5, 12)

# Graph cars using a y axis that ranges from 0 to 12
plot(cars, type="o", col="blue", ylim=c(0,12))

# Graph trucks with red dashed line and square points
lines(trucks, type="o", pch=22, lty=2, col="red")

# Create a title with a red, bold/italic font
title(main="Autos", col.main="red", font.main=4)

png

csv_url <- "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
mtcars <- read.csv(csv_url)
mtcars
A data.frame: 32 × 12
modelmpgcyldisphpdratwtqsecvsamgearcarb
<chr><dbl><int><dbl><int><dbl><dbl><dbl><int><int><int><int>
Mazda RX4 21.06160.01103.902.62016.460144
Mazda RX4 Wag 21.06160.01103.902.87517.020144
Datsun 710 22.84108.0 933.852.32018.611141
Hornet 4 Drive 21.46258.01103.083.21519.441031
Hornet Sportabout 18.78360.01753.153.44017.020032
Valiant 18.16225.01052.763.46020.221031
Duster 360 14.38360.02453.213.57015.840034
Merc 240D 24.44146.7 623.693.19020.001042
Merc 230 22.84140.8 953.923.15022.901042
Merc 280 19.26167.61233.923.44018.301044
Merc 280C 17.86167.61233.923.44018.901044
Merc 450SE 16.48275.81803.074.07017.400033
Merc 450SL 17.38275.81803.073.73017.600033
Merc 450SLC 15.28275.81803.073.78018.000033
Cadillac Fleetwood 10.48472.02052.935.25017.980034
Lincoln Continental10.48460.02153.005.42417.820034
Chrysler Imperial 14.78440.02303.235.34517.420034
Fiat 128 32.44 78.7 664.082.20019.471141
Honda Civic 30.44 75.7 524.931.61518.521142
Toyota Corolla 33.94 71.1 654.221.83519.901141
Toyota Corona 21.54120.1 973.702.46520.011031
Dodge Challenger 15.58318.01502.763.52016.870032
AMC Javelin 15.28304.01503.153.43517.300032
Camaro Z28 13.38350.02453.733.84015.410034
Pontiac Firebird 19.28400.01753.083.84517.050032
Fiat X1-9 27.34 79.0 664.081.93518.901141
Porsche 914-2 26.04120.3 914.432.14016.700152
Lotus Europa 30.44 95.11133.771.51316.901152
Ford Pantera L 15.88351.02644.223.17014.500154
Ferrari Dino 19.76145.01753.622.77015.500156
Maserati Bora 15.08301.03353.543.57014.600158
Volvo 142E 21.44121.01094.112.78018.601142
plot(mtcars$wt,mtcars$mpg, main="Scatterplot in Base R",
   xlab="Car Weight", ylab="MPG",
   pch=4, col = "blue", lwd=1, cex = 2)
abline(lm(mtcars$mpg~mtcars$wt), col="red")
text(mtcars$wt, mtcars$mpg, labels=rownames(mtcars), cex=0.5, font=2)

png

hist(mtcars$hp,
      prob = TRUE)
lines(density(mtcars$hp), # density plot
     lwd = 2, # thickness of line
     col = "red")
abline(v=mean(mtcars$hp),
     lty="dashed",
     col="blue")

png

Probability Density: It tells you how the data is distributed over different ranges (or bins) of values. The height of each bar shows how densely the data is packed in that range of horsepower values.

Producing graphs using ggplot2

library(ggplot2)
ggplot(mtcars, aes(x=wt, y=mpg)) +
  geom_point(size=5, shape=4, color="blue", stroke=1) +
  geom_smooth(method=lm, color="red") +
  ggtitle("Scatterplot in ggplot2") +
  xlab("Car Weight") + # for the x axis label
  geom_text(label=rownames(mtcars),cex=3)
`geom_smooth()` using formula = 'y ~ x'

png

ggplot(mtcars, aes(x = hp)) +
  geom_histogram(aes(y = ..density..),
                 bins = 6, # You can adjust the number of bins
                 fill = "lightblue",
                 color = "black") +  # Histogram with probability density
  geom_density(color = "red", size = 1.5) +  # Add the density plot
  geom_vline(aes(xintercept = mean(hp)),
             linetype = "dashed",
             color = "blue",
             size = 1.2) +  # Add dashed blue vertical line at the mean
  labs(title = "Distribution of Horsepower in mtcars Dataset",
       x = "Horsepower",
       y = "Density") +  # Add axis labels and title
  theme_minimal() # Use a clean theme for the plot

png

Reference

https://sites.harding.edu/fmccown/R/

https://jtr13.github.io/cc21fall2/base-r-vs.-ggplot2-visualization.html#base-r-vs.-ggplot2-visualization