# R Visualizations – ggplot2  (PART-2)

• By
• September 25, 2019
• Big Data

# R Visualizations- Part 2

R Visualizations – ggplot2  (PART-2)

1. Distribution

When working with a large amount of data, it is important to study how and where the data points are distributed.

Histogram

Histogram plot  – continuous variable

# Histograms of a continuous variable (engine displacement), filled by class.
library(ggplot2)

theme_set(theme_classic())

graph <- ggplot(mpg, aes(displ)) + scale_fill_brewer(palette = "Spectral")

# Fine-grained view: every bar is 0.1 wide on the displacement axis.
graph + geom_histogram(aes(fill = class), binwidth = .1, col = "black", size = .1) +
  labs(title = "Histogram with Auto Binning",
       subtitle = "Engine Displacement across Vehicle Classes")

# Coarse view: the data range is split into exactly five bins.
graph + geom_histogram(aes(fill = class), bins = 5, col = "black", size = .1) +
  labs(title = "Histogram with Fixed Bins",
       subtitle = "Engine Displacement across Vehicle Classes")

Histogram plot  — categorical variable

# Bar chart of a categorical variable: one bar per manufacturer, stacked by
# vehicle class.
library(ggplot2)

theme_set(theme_classic())

graph <- ggplot(mpg, aes(manufacturer))

graph + geom_bar(aes(fill = class), width = 0.5) +
  # Tilt the manufacturer names so they do not overlap.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Histogram on Categorical Variable",
       subtitle = "Manufacturer across Vehicle Classes")

Density plot

# Density of city mileage, one semi-transparent curve per cylinder count.
library(ggplot2)

theme_set(theme_classic())

graph <- ggplot(mpg, aes(cty))

graph + geom_density(aes(fill = factor(cyl)), alpha = 0.8) +
  labs(title = "Density plot",
       subtitle = "City Mileage Grouped by Number of cylinders",
       caption = "Source: mpg",
       x = "City Mileage",
       fill = "# Cylinders")

Box Plot

A box plot is an excellent tool for studying the distribution of data. It shows the basic summary information of each group in a single plot.

# Box plots of city mileage per vehicle class.
library(ggplot2)

theme_set(theme_classic())

graph <- ggplot(mpg, aes(class, cty))

# varwidth = TRUE lets the box width vary with the number of observations.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
graph + geom_boxplot(varwidth = TRUE, fill = "plum") +
  labs(title = "Box plot",
       subtitle = "City Mileage grouped by Class of vehicle",
       caption = "Source: mpg",
       x = "Class of Vehicle",
       y = "City Mileage")

library(ggthemes)

# Same plot, with every class split further by number of cylinders.
graph <- ggplot(mpg, aes(class, cty))

graph + geom_boxplot(aes(fill = factor(cyl))) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Box plot",
       subtitle = "City Mileage grouped by Class of vehicle",
       caption = "Source: mpg",
       x = "Class of Vehicle",
       y = "City Mileage")

Dot + Box Plot

# Box plot with the individual observations overlaid as dots.
library(ggplot2)

theme_set(theme_bw())

graph <- ggplot(mpg, aes(manufacturer, cty))

graph + geom_boxplot() +
  # Dots are binned along y and stacked symmetrically around each box.
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = .5, fill = "red") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  # The x aesthetic is `manufacturer`, so the labels name it (the original
  # labels said "Class", which is not what is plotted).
  labs(title = "Box plot + Dot plot",
       subtitle = "City Mileage vs Manufacturer: Each dot represents 1 row in source data",
       caption = "Source: mpg",
       x = "Manufacturer",
       y = "City Mileage")

Tufte Boxplot

# Minimal-ink (Tufte style) box plot of city mileage per manufacturer.
library(ggthemes)
library(ggplot2)

theme_set(theme_tufte())

graph <- ggplot(mpg, aes(manufacturer, cty))

graph + geom_tufteboxplot() +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  # Labels follow the plotted x variable (`manufacturer`), not "Class".
  labs(title = "Tufte Styled Boxplot",
       subtitle = "City Mileage grouped by Manufacturer",
       caption = "Source: mpg",
       x = "Manufacturer",
       y = "City Mileage")

Violin Plot

# Violin plot of city mileage per vehicle class.
library(ggplot2)

theme_set(theme_bw())

graph <- ggplot(mpg, aes(class, cty))

graph + geom_violin() +
  labs(title = "Violin plot",
       subtitle = "City Mileage vs Class of vehicle",
       caption = "Source: mpg",
       x = "Class of Vehicle",
       y = "City Mileage")

Population Pyramid

# Population-pyramid style chart: horizontal bars per stage, coloured by gender.
library(ggplot2)
library(ggthemes)

options(scipen = 999)  # print large axis numbers in full, not as 1e+07

# NOTE(review): `email_campaign` is not created anywhere in this article. It
# must be loaded beforehand and needs the columns Stage, Users and Gender.

# Axis breaks in absolute users and matching labels in millions
# ("15m" ... "0m" ... "15m"); both vectors have seven entries.
brks <- seq(-15000000, 15000000, 5000000)
lbls <- paste0(as.character(c(seq(15, 0, -5), seq(5, 15, 5))), "m")

ggplot(email_campaign, aes(x = Stage, y = Users, fill = Gender)) +
  geom_bar(stat = "identity", width = .6) +
  scale_y_continuous(breaks = brks, labels = lbls) +
  coord_flip() +  # stages on the vertical axis
  labs(title = "Email Campaign ") +
  theme_tufte() +
  theme(plot.title = element_text(hjust = .5),
        axis.ticks = element_blank()) +
  scale_fill_brewer(palette = "Dark2")

Violin Plot

1. Composition

Waffle Chart

# Waffle chart: a 10 x 10 grid in which each tile stands for roughly 1% of the
# observations, coloured by vehicle class.
v <- mpg$class

nr <- 10

dfrm <- expand.grid(y = 1:nr, x = 1:nr)

# Scale the class counts so that they add up to nr * nr tiles. The result is
# stored as `categ_table` rather than `table`, to avoid shadowing base::table().
categ_table <- round(table(v) * ((nr * nr) / (length(v))))

categ_table

# NOTE: the rounded counts must sum to nr * nr (100) for this assignment to
# succeed; with other data they may need a manual adjustment.
dfrm$category <- factor(rep(names(categ_table), categ_table))

ggplot(dfrm, aes(x = x, y = y, fill = category)) +
  geom_tile(color = "black", size = 0.5) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), trans = "reverse") +
  scale_fill_brewer(palette = "Set3") +
  labs(title = "Waffle Chart",
       subtitle = "'Class' of vehicles",
       caption = "Source: mpg") +
  theme(panel.border = element_rect(size = 2),
        plot.title = element_text(size = rel(1.2)),
        axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        legend.title = element_blank(),
        legend.position = "right")

Pie Chart

# Pie charts: a single stacked bar bent into a circle with coord_polar().
library(ggplot2)

theme_set(theme_classic())

# Variant 1: start from a pre-computed frequency table.
data <- as.data.frame(table(mpg$class))
colnames(data) <- c("class", "freq")

piechart <- ggplot(data, aes(x = "", y = freq, fill = factor(class))) +
  geom_bar(width = 1, stat = "identity") +
  theme(axis.line = element_blank(),
        plot.title = element_text(hjust = 0.5)) +
  labs(fill = "class", x = NULL, y = NULL,
       title = "Pie Chart of class", caption = "Source: mpg")

piechart + coord_polar(theta = "y", start = 0)

# Variant 2: let geom_bar() count the rows of the raw data.
piechart <- ggplot(mpg, aes(x = "", fill = factor(class))) +
  geom_bar(width = 1) +
  theme(axis.line = element_blank(),
        plot.title = element_text(hjust = 0.5)) +
  labs(fill = "class", x = NULL, y = NULL,
       title = "Pie Chart of class", caption = "Source: mpg")

piechart + coord_polar(theta = "y", start = 0)

Violin Plot

Treemap

# Treemap of `langs`, with tiles sized by `value` and grouped/coloured by
# `parent`.
library(ggplot2)
library(treemapify)

# NOTE(review): `langs` is not created anywhere in this article; it must be
# loaded beforehand and needs the columns value, parent and id.
# NOTE(review): ggplotify() may not exist in current treemapify releases —
# confirm against the installed version.
treeMap <- treemapify(langs, area = "value", fill = "parent",
                      label = "id", group = "parent")

treePlot <- ggplotify(treeMap) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_brewer(palette = "Dark2")

print(treePlot)

Bar Chart

# Bar charts of the number of vehicles per manufacturer.

# Variant 1: plot a pre-computed frequency table (columns Var1 and Freq).
freq <- table(mpg$manufacturer)
data <- as.data.frame.table(freq)

library(ggplot2)

theme_set(theme_classic())

graph <- ggplot(data, aes(Var1, Freq))

graph + geom_bar(stat = "identity", width = 0.5, fill = "tomato2") +
  labs(title = "Bar Chart",
       subtitle = "Manufacturer of vehicles",
       caption = "Source: Frequency of Manufacturers from 'mpg' dataset") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

# Variant 2: count from the raw data and stack the bars by vehicle class.
graph <- ggplot(mpg, aes(manufacturer))

graph + geom_bar(aes(fill = class), width = 0.5) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Categorywise Bar Chart",
       subtitle = "Manufacturer of vehicles",
       caption = "Source: Manufacturers from 'mpg' dataset")

1. Change

# Time series plot straight from a `ts` object via ggfortify's autoplot().
library(ggplot2)
library(ggfortify)

theme_set(theme_classic())

autoplot(AirPassengers) +
  labs(title = "AirPassengers") +
  theme(plot.title = element_text(hjust = 0.5))

# Time series charts drawn from a data frame (ggplot2's `economics`).
library(ggplot2)
library(lubridate)

theme_set(theme_classic())

# `economics` has no `returns_perc` column, so derive it first: the relative
# month-over-month change of the personal savings rate, with 0 for the first
# month. All three charts below plot this column.
economics$returns_perc <- c(
  0,
  diff(economics$psavert) / economics$psavert[-length(economics$psavert)]
)

# Full series with default axis breaks.
ggplot(economics, aes(x = date)) +
  geom_line(aes(y = returns_perc)) +
  labs(title = "Time Series Chart",
       subtitle = "Returns Percentage from 'Economics' Dataset",
       caption = "Source: Economics",
       y = "Returns %")

theme_set(theme_bw())

# First 24 months, with one labelled tick per month ("Jul 1967", ...).
economic <- economics[1:24, ]
lbels <- paste0(month.abb[month(economic$date)], " ",
                lubridate::year(economic$date))
breks <- economic$date

ggplot(economic, aes(x = date)) +
  geom_line(aes(y = returns_perc)) +
  labs(title = "Monthly Time Series",
       subtitle = "Returns Percentage from Economics Dataset",
       caption = "Source: Economics",
       y = "Returns %") +
  scale_x_date(labels = lbels, breaks = breks) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        panel.grid.minor = element_blank())

# First 90 months, with one labelled tick every 12th row (one per year).
economic <- economics[1:90, ]
breks <- economic$date[seq(1, length(economic$date), 12)]
lbels <- lubridate::year(breks)

ggplot(economic, aes(x = date)) +
  geom_line(aes(y = returns_perc)) +
  labs(title = "Yearly Time Series",
       subtitle = "Returns Percentage from Economics Dataset",
       caption = "Source: Economics",
       y = "Returns %") +
  scale_x_date(labels = lbels, breaks = breks) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        panel.grid.minor = element_blank())

# Several series in one chart, starting from long-format data
# (one row per date and variable).
data(economics_long, package = "ggplot2")

library(ggplot2)
library(lubridate)

theme_set(theme_bw())

daf <- economics_long[economics_long$variable %in% c("psavert", "uempmed"), ]

# Subset `daf` itself. The original indexed `df`, which is not the data frame
# built above (a bare `df` resolves to the F-density function stats::df).
daf <- daf[lubridate::year(daf$date) %in% c(1967:1981), ]

# One labelled tick every 12th row.
breks <- daf$date[seq(1, length(daf$date), 12)]
lbels <- lubridate::year(breks)

ggplot(daf, aes(x = date)) +
  geom_line(aes(y = value, col = variable)) +
  labs(title = "Time Series of Returns Percentage",
       subtitle = "Drawn from Long Data format",
       caption = "Source: Economics",
       y = "Returns %",
       color = NULL) +
  scale_x_date(labels = lbels, breaks = breks) +
  scale_color_manual(labels = c("psavert", "uempmed"),
                     values = c("psavert" = "#00ba38", "uempmed" = "#f8766d")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8),
        panel.grid.minor = element_blank())

# Several series in one chart, starting from wide-format data
# (one column per series, one geom_line() per column).
library(ggplot2)
library(lubridate)

theme_set(theme_bw())

daf <- economics[, c("date", "psavert", "uempmed")]
daf <- daf[lubridate::year(daf$date) %in% c(1967:1981), ]

# One labelled tick every 12th row.
breks <- daf$date[seq(1, length(daf$date), 12)]
lbels <- lubridate::year(breks)

ggplot(daf, aes(x = date)) +
  # The constant strings passed to `col` become the legend keys that
  # scale_color_manual() maps to actual colours.
  geom_line(aes(y = psavert, col = "psavert")) +
  geom_line(aes(y = uempmed, col = "uempmed")) +
  labs(title = "Time Series of Returns Percentage",
       subtitle = "Drawn From Wide Data format",
       caption = "Source: Economics",
       y = "Returns %") +
  scale_x_date(labels = lbels, breaks = breks) +
  scale_color_manual(name = "",
                     values = c("psavert" = "#00ba38", "uempmed" = "#f8766d")) +
  theme(panel.grid.minor = element_blank())

Stacked Area Chart

# Stacked area chart built by hand from wide-format data.
library(ggplot2)
library(lubridate)

theme_set(theme_bw())

daf <- economics[, c("date", "psavert", "uempmed")]

# Subset `daf` itself. The original indexed `df`, which is not the data frame
# built above (a bare `df` resolves to the F-density function stats::df).
daf <- daf[lubridate::year(daf$date) %in% c(1967:1981), ]

# One labelled tick every 12th row.
breks <- daf$date[seq(1, length(daf$date), 12)]
lbels <- lubridate::year(breks)

ggplot(daf, aes(x = date)) +
  # The first layer draws the sum of both series; the second layer draws
  # uempmed on top of it, so the visible remainder corresponds to psavert.
  geom_area(aes(y = psavert + uempmed, fill = "psavert")) +
  geom_area(aes(y = uempmed, fill = "uempmed")) +
  labs(title = "Area Chart of Returns Percentage",
       subtitle = "From Wide Data format",
       caption = "Source: Economics",
       y = "Returns %") +
  scale_x_date(labels = lbels, breaks = breks) +
  scale_fill_manual(name = "",
                    values = c("psavert" = "#00ba38", "uempmed" = "#f8766d")) +
  theme(panel.grid.minor = element_blank())

Calendar Heatmap

# Calendar heatmap: one tile per day, arranged by week of month and weekday,
# with one facet per year and month.
library(ggplot2)
library(plyr)
library(scales)
library(zoo)

# NOTE(review): this section does not load its own data. `daf` must be
# replaced beforehand by a data frame with the columns date, year, monthf,
# week, weekdayf and VIX.Close; the `daf` left over from the previous section
# does not have them.
daf$date <- as.Date(daf$date)

daf <- daf[daf$year >= 2012, ]

daf$yearmonth <- as.yearmon(daf$date)
daf$yearmonthf <- factor(daf$yearmonth)

# Week index within each month: the first week present in a month becomes 1.
daf <- ddply(daf, .(yearmonthf), transform, monthweek = 1 + week - min(week))

daf <- daf[, c("year", "yearmonthf", "monthf", "week", "monthweek",
               "weekdayf", "VIX.Close")]

ggplot(daf, aes(monthweek, weekdayf, fill = VIX.Close)) +
  geom_tile(colour = "white") +
  facet_grid(year ~ monthf) +
  labs(x = "Week of Month",
       y = "",
       title = "Time-Series Calendar Heatmap",
       subtitle = "Yahoo Closing Price",
       fill = "Close")

Slope Chart

library(dplyr)

theme_set(theme_classic())

#' Prepare data for a Tufte-style slope graph.
#'
#' Named `tufte_sort` to match the call further down; the original definition
#' was named `sort`, which left that call unresolved and masked base::sort().
#'
#' @param daf Data frame holding one value per x position and group.
#' @param x,y,group Names of the columns holding the x position, the value and
#'   the group label.
#' @param method Not used by the function body; kept so existing calls work.
#' @param min.space Minimum vertical gap between neighbouring rows, given as a
#'   fraction of the overall value range.
#' @return A long data frame with the columns group, yshift, x, y and ypos,
#'   where ypos is y plus the accumulated shift of the group.
tufte_sort <- function(daf, x = "year", y = "value", group = "group",
                       method = "tufte", min.space = 0.05) {
  # Keep only the three relevant columns, under fixed names.
  id <- match(c(x, y, group), names(daf))
  daf <- daf[, id]
  names(daf) <- c("x", "y", "group")

  # Ensure every (x, group) combination exists; missing values become 0.
  temp <- expand.grid(x = unique(daf$x), group = unique(daf$group))
  temp <- merge(daf, temp, all.y = TRUE)
  temp$y[is.na(temp$y)] <- 0

  # One row per group, one column per x; order rows by the first x column.
  wide <- reshape2::dcast(temp, group ~ x, value.var = "y")
  wide <- wide[order(wide[, 2]), ]

  # Convert the relative gap into data units. The original read an undefined
  # `min.sp` here instead of the `min.space` argument.
  min.space <- min.space * diff(range(wide[, -1]))

  # Walk up from the bottom row and push a row up whenever it comes closer
  # than min.space to the row below it. seq_len()[-1] is empty for a
  # single-row table, unlike 2:nrow().
  yshift <- numeric(nrow(wide))
  for (i in seq_len(nrow(wide))[-1]) {
    matx <- as.matrix(wide[(i - 1):i, -1])
    dmin <- min(diff(matx))
    yshift[i] <- if (dmin < min.space) min.space - dmin else 0
  }
  wide <- cbind(wide, yshift = cumsum(yshift))

  shift_scale <- 1
  long <- reshape2::melt(wide, id.vars = c("group", "yshift"),
                         variable.name = "x", value.name = "y")

  # Plot position = value + scaled shift (the original added `y` twice).
  transform(long, ypos = y + shift_scale * yshift)
}

#' Draw a slope graph from the output of the sorting step.
#'
#' Named `plot_slopegraph` to match the call further down; the original
#' definition was named `plot`, which left that call unresolved and masked
#' base plot().
#'
#' @param daf Data frame with the columns x, y, ypos and group.
#' @return A ggplot object.
plot_slopegraph <- function(daf) {
  # Axis labels and tick positions come from the first x position.
  first_x <- subset(daf, x == head(x, 1))
  ylabs <- first_x$group
  yvals <- first_x$ypos
  font_size <- 3

  ggplot(daf, aes(x = x, y = ypos)) +
    geom_line(aes(group = group), colour = "grey80") +
    # White discs blank out the line behind each value label.
    geom_point(colour = "white", size = 8) +
    geom_text(aes(label = y), size = font_size, family = "American Typewriter") +
    scale_y_continuous(name = "", breaks = yvals, labels = ylabs)
}

# Build and draw the slope chart.
# NOTE(review): `source_daf` is not created anywhere in this article; it must
# be loaded beforehand and needs the columns year, value and group.
daf <- tufte_sort(source_daf, x = "year", y = "value", group = "group",
                  method = "tufte", min.space = 0.05)

# Readable x labels and whole-number value labels.
daf <- transform(daf,
                 x = factor(x, levels = c(5, 10, 15, 20),
                            labels = c("5 years", "10 years", "15 years", "20 years")),
                 y = round(y))

plot_slopegraph(daf) +
  labs(title = "Estimates of % survival rates") +
  theme(axis.title = element_blank(),
        axis.ticks = element_blank(),
        plot.title = element_text(hjust = 0.5, family = "American Typewriter",
                                  face = "bold"),
        axis.text = element_text(family = "American Typewriter", face = "bold"))

Seasonal Plot

# Seasonal plots: one line per year, drawn over the months of the year.
library(ggplot2)
library(forecast)

theme_set(theme_classic())

# Restrict the Nottingham temperature series to 1920-1925. The result is named
# `nottem_small` because that is the name the plot call below uses.
nottem_small <- window(nottem, start = c(1920, 1), end = c(1925, 12))

ggseasonplot(AirPassengers) +
  labs(title = "Seasonal plot: International Airline Passengers")

ggseasonplot(nottem_small) +
  labs(title = "Seasonal plot: Air temperatures at Nottingham Castle")

1. Groups

# Dendrogram of a hierarchical clustering of the USArrests data.
library(ggplot2)
library(ggdendro)

theme_set(theme_bw())

# Average-linkage clustering on the distances between states.
hcd <- hclust(dist(USArrests), "ave")

ggdendrogram(hcd, rotate = TRUE, size = 2)

Clusters

# Iris observations on their first two principal components, with each
# species encircled.
library(ggplot2)
library(ggalt)
library(ggfortify)

theme_set(theme_classic())

# PCA on the four numeric measurement columns.
daf <- iris[c(1, 2, 3, 4)]
pca_mod <- prcomp(daf)

# Principal-component scores plus the species label.
daf_pc <- data.frame(pca_mod$x, Species = iris$Species)

# One subset per species, each used as the data of one encircle layer.
daf_pc_vir <- daf_pc[daf_pc$Species == "virginica", ]
daf_pc_set <- daf_pc[daf_pc$Species == "setosa", ]
daf_pc_ver <- daf_pc[daf_pc$Species == "versicolor", ]

ggplot(daf_pc, aes(PC1, PC2, col = Species)) +
  geom_point(aes(shape = Species), size = 2) +
  labs(title = "Iris Clustering",
       subtitle = "With principal components PC1 and PC2 as X and Y axis",
       caption = "Source: Iris") +
  # Widen the visible range by 20% so the outlines are not clipped.
  coord_cartesian(xlim = 1.2 * c(min(daf_pc$PC1), max(daf_pc$PC1)),
                  ylim = 1.2 * c(min(daf_pc$PC2), max(daf_pc$PC2))) +
  geom_encircle(data = daf_pc_vir, aes(x = PC1, y = PC2)) +
  geom_encircle(data = daf_pc_set, aes(x = PC1, y = PC2)) +
  geom_encircle(data = daf_pc_ver, aes(x = PC1, y = PC2))

1. Spatial

# Points of interest in Chennai drawn on top of different base maps.
library(ggplot2)
library(ggmap)
library(ggalt)

# NOTE(review): geocode() and the Google map sources presumably require a
# registered API key (ggmap::register_google) — confirm before running.
ch <- geocode("Chennai")

# Base maps. `ch_road_map` is used further down but was never created in the
# original, so it is defined here next to its siblings.
ch_sat_map <- qmap("chennai", zoom = 12, source = "google", maptype = "satellite")
ch_road_map <- qmap("chennai", zoom = 12, source = "google", maptype = "roadmap")
ch_hybrid_map <- qmap("chennai", zoom = 12, source = "google", maptype = "hybrid")
ch_osm_map <- qmap("chennai", zoom = 12, source = "osm")

ch_places <- c("Kolathur",
               "Washermanpet",
               "Royapettah",
               "Guindy")

# Geocode the vector defined above (the original referred to an undefined
# `chennai_places`).
pl_loc <- geocode(ch_places)

# Overlay the places as points plus an enclosing outline on a base map.
add_places <- function(base_map) {
  base_map +
    geom_point(aes(x = lon, y = lat),
               data = pl_loc,
               alpha = 0.7,
               size = 7,
               color = "tomato") +
    geom_encircle(aes(x = lon, y = lat),
                  data = pl_loc, size = 2, color = "blue")
}

add_places(ch_osm_map)

add_places(ch_road_map)

add_places(ch_hybrid_map)

Sample Plot practice:-

# 1) Stacked bar plot: number of cars per gear count, split by cylinder count.
Sample_Numbers <- table(mtcars$cyl, mtcars$gear)

barplot(Sample_Numbers,
        main = "Automobile cylinder number\ngears",
        col = c("red", "orange", "steelblue"),
        legend.text = rownames(Sample_Numbers),  # full name, not partial `legend`
        xlab = "Number of Gears",
        ylab = "count")

# 2) Histogram of the daily temperature readings in `airquality`.
hist(airquality$Temp,
     col = "steelblue",
     main = "Maximum Daily Temperature",
     xlab = "Temperature (degrees Fahrenheit)")

# 3) Heatmap of a small simulated matrix whose rows are shuffled.
# Seed the generator before the first random draw so that the simulated
# values, and not only the row shuffle, are reproducible.
set.seed(143)

Sample_x <- rnorm(10, mean = rep(1:5, each = 2), sd = 0.7)
Sample_y <- rnorm(10, mean = rep(c(1, 9), each = 5), sd = 0.1)

data <- data.frame(x = Sample_x, y = Sample_y)

# Shuffle the row order before clustering.
data_Sample <- as.matrix(data)[sample(1:10), ]

heatmap(data_Sample)

# 4) Scatter plot of wind speed against ozone for September.
# The original call was cut off inside the colour string; the closing quote
# and parentheses are restored here.
with(subset(airquality, Month == 9), plot(Wind, Ozone, col = "steelblue"))

# The plot shows Wind against Ozone, so the title names those two variables.
title("Wind and Ozone in NYC in September of 1973")

# 5) Box plot of fuel efficiency by number of cylinders.
# Start from `mtcars`: the original transformed `sample_cars` before that
# object existed. `cyl` becomes a factor so each cylinder count is a group.
sample_cars <- transform(mtcars, cyl = factor(cyl))

class(sample_cars$cyl)

boxplot(mpg ~ cyl, sample_cars,
        xlab = "Number of\nCylinders",
        ylab = "miles per gallon",
        main = "miles per gallon\nfor varied cylinders in automobiles",
        cex.main = 1.2)

# 6) Correlation matrix of the car data, drawn with corrplot.
library(corrplot)

# cor() needs all-numeric input, so it is computed on `mtcars` rather than on
# `sample_cars`, whose `cyl` column was converted to a factor in step 5.
corr_sample <- cor(mtcars)

corrplot(corr_sample)

# Lower triangle only, with the coefficients printed as numbers.
corrplot(corr_sample, method = "number", type = "lower")

# 7) Area chart of the mean wind speed for each day of the month.
library(dplyr)
library(ggplot2)

airquality %>%
  group_by(Day) %>%
  summarise(mean_wind = mean(Wind)) %>%
  ggplot() +
  geom_area(aes(x = Day, y = mean_wind)) +
  labs(title = "Area Chart of Average Wind per Day",
       subtitle = "using airquality data",
       y = "Mean Wind")

Author:-
Rahul Pund

## Call the Trainer and Book your free demo Class now!!!

© Copyright 2019 | Sevenmentor Pvt Ltd.