3. Descriptive statistics

[1]:

# Read the word-frequency table; the first line of the file supplies the
# column names (header = TRUE).
freqlist <- read.table(file = "https://bit.ly/35xnbuF", header = TRUE)
# Preview the first six rows.
head(freqlist, n = 6L)

A data.frame: 6 × 2
	WORD	FREQUENCY
	<fct>	<int>
1	said	12704
2	know	10202
3	got	8825
4	get	6756
5	go	6427
6	think	5961

[2]:

# Scatter plot: each frequency is drawn against its position in the list.
# cex = 0.6 shrinks the plotting symbols.
plot(
  x = freqlist$FREQUENCY,
  main = "plot of a frequency list",
  xlab = "index of word types",
  ylab = "frequency",
  cex = 0.6
)

../_images/basics_03_descriptive_statistics_2_0.png

[3]:

# Same values drawn as a connected line (type = "l") with a thicker
# stroke (lwd = 2).
with(
  freqlist,
  plot(
    FREQUENCY,
    xlab = "index of word types",
    ylab = "frequency",
    type = "l",
    lwd = 2,
    cex = 0.6
  )
)

../_images/basics_03_descriptive_statistics_3_0.png

[4]:

# First 20 rows only: set up the axes with invisible (white) points, then
# write each word at the position of its frequency.
with(freqlist[1:20, ], {
  plot(
    FREQUENCY,
    col = "white",
    xlab = "index of word types",
    ylab = "frequency"
  )
  text(FREQUENCY, labels = WORD, cex = 0.7)
})

../_images/basics_03_descriptive_statistics_4_0.png

[5]:

# Line and labels combined: a light grey line through the first 20
# frequencies, with each word written on top of it.
with(freqlist[1:20, ], {
  plot(
    FREQUENCY,
    type = "l",
    col = "lightgrey",
    xlab = "index of word types",
    ylab = "frequency"
  )
  text(FREQUENCY, labels = WORD, cex = 0.7)
})

../_images/basics_03_descriptive_statistics_5_0.png

The distribution at work is known as Zipfian. It is named after Zipf’s law, according to which many rare events coexist with very few large events. The resulting curve decreases continually from its highest point (although, strictly speaking, this is not a peak).

[7]:

# now with tidyverse
# Silence warnings for the rest of the session so the plots below render
# without warning output (note: this also hides genuine warnings).
options(warn = -1)
# Suppress the tidyverse attach/conflicts banner.
options(tidyverse.quiet = TRUE)
# "messages" is not an R option, so the former options(messages = -1) had no
# effect; package start-up messages are silenced explicitly instead.
suppressPackageStartupMessages(library(tidyverse))

[9]:

# Line plot of the first 20 rows, words ordered by decreasing frequency.
# group = 1 puts all words in a single group so geom_line() connects them
# across the discrete x axis. No label aesthetic is declared because
# geom_line() does not draw text.
ggplot(
  freqlist[1:20, ],
  aes(x = reorder(WORD, -FREQUENCY), y = FREQUENCY, group = 1)
) +
  geom_line() +
  xlab("with ggplot") + # title shown under the x axis
  theme_bw()

../_images/basics_03_descriptive_statistics_8_0.png

[10]:

# No line this time: every word is written as a text label at its frequency.
# The label aesthetic feeds geom_text(); check_overlap = TRUE skips labels
# that would overlap one already drawn. The x-axis title is left empty.
ggplot(
  data = freqlist[1:20, ],
  mapping = aes(
    x = reorder(WORD, -FREQUENCY),
    y = FREQUENCY,
    group = 1,
    label = WORD
  )
) +
  geom_text(check_overlap = TRUE) +
  labs(x = "") +
  theme_bw()

../_images/basics_03_descriptive_statistics_9_0.png

[11]:

# bar plot of the first 20 rows; the las argument sets the orientation of the
# axis labels: parallel to the axis (las = 0) or perpendicular to it (las = 2).
barplot(
  height = freqlist$FREQUENCY[1:20],
  names.arg = freqlist$WORD[1:20],
  las = 2
)

../_images/basics_03_descriptive_statistics_10_0.png

[12]:

# ggplot version of the bar plot: one column per word, ordered by decreasing
# frequency, with the x tick labels rotated by 90 degrees.
ggplot(
  data = freqlist[1:20, ],
  mapping = aes(x = reorder(WORD, -FREQUENCY), y = FREQUENCY)
) +
  geom_col() +
  labs(x = "WORD") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

../_images/basics_03_descriptive_statistics_11_0.png

[15]:

# Base-R histogram of the first 100 frequencies: no main title, tick labels
# perpendicular to the axes (las = 2).
hist(
  x = freqlist$FREQUENCY[1:100],
  main = "",
  xlab = "frequency bins",
  las = 2
)
# ggplot counterpart using bins that are 400 frequency units wide.
ggplot(data = freqlist[1:100, ], mapping = aes(x = FREQUENCY)) +
  stat_bin(binwidth = 400) +
  theme_bw()

../_images/basics_03_descriptive_statistics_12_0.png

../_images/basics_03_descriptive_statistics_12_1.png

[18]:

# Download the serialized R object and store it as `data`.
# The connection is created explicitly so that it can be closed once the
# object has been read; readRDS(url(...)) would leave it open until it is
# garbage-collected. gzcon() lets readRDS() read a (possibly compressed)
# stream from the URL.
con <- gzcon(url("https://bit.ly/2TRdVuM"))
data <- readRDS(con)
close(con)

[19]:

# To visualize where the mean stands in your data:
# two panels side by side, each with a horizontal line at the mean.
par(mfrow = c(1, 2))

# left panel: split infinitive, mean in blue
with(data, {
  plot(
    SPLIT_INFINITIVE,
    main = "split infinitive",
    xlab = "decades",
    ylab = "frequency counts"
  )
  abline(h = mean(SPLIT_INFINITIVE), col = "blue")
  # the label is placed slightly above the line so the two do not overlap
  text(5, mean(SPLIT_INFINITIVE) + 15, "mean", col = "blue")
})

# right panel: unsplit infinitive, mean in green
with(data, {
  plot(
    UNSPLIT_INFINITIVE,
    main = "unsplit infinitive",
    xlab = "decades",
    ylab = "frequency counts"
  )
  abline(h = mean(UNSPLIT_INFINITIVE), col = "green")
  text(5, mean(UNSPLIT_INFINITIVE) + 20, "mean", col = "green")
})

../_images/basics_03_descriptive_statistics_14_0.png

[20]:

# median and mean: same two panels, now with a solid line at the mean and a
# dotted line (lty = 3) at the median, each with a label just above it.
par(mfrow = c(1, 2))

# left panel: split infinitive (blue)
with(data, {
  plot(
    SPLIT_INFINITIVE,
    main = "split infinitive",
    xlab = "decades",
    ylab = "frequency counts"
  )
  abline(h = mean(SPLIT_INFINITIVE), col = "blue")
  abline(h = median(SPLIT_INFINITIVE), col = "blue", lty = 3)
  text(5, mean(SPLIT_INFINITIVE) + 15, "mean", col = "blue")
  text(5, median(SPLIT_INFINITIVE) + 15, "median", col = "blue")
})

# right panel: unsplit infinitive (green)
with(data, {
  plot(
    UNSPLIT_INFINITIVE,
    main = "unsplit infinitive",
    xlab = "decades",
    ylab = "frequency counts"
  )
  abline(h = mean(UNSPLIT_INFINITIVE), col = "green")
  abline(h = median(UNSPLIT_INFINITIVE), col = "green", lty = 3)
  text(5, mean(UNSPLIT_INFINITIVE) + 20, "mean", col = "green")
  text(5, median(UNSPLIT_INFINITIVE) + 20, "median", col = "green")
})

../_images/basics_03_descriptive_statistics_15_0.png

[22]:

# mode: tally how often each NP_tag value occurs and chart the tallies;
# the tallest bar corresponds to the mode.
df.each.every <- read.delim(file = "https://bit.ly/3oSdIWn", header = TRUE)

# count() stores the tallies in a column called n, renamed here to count
df <- rename(count(df.each.every, NP_tag), count = n)

# geom_col() uses the count column as the bar height
ggplot(data = df, mapping = aes(x = NP_tag, y = count)) +
  geom_col()

../_images/basics_03_descriptive_statistics_16_0.png

[23]:

# Reshape to long format: every column except DECADE is stacked, with the
# former column name stored in `construction` and its value in `count`.
# Rows whose value is NA are dropped.
data.tidy <- pivot_longer(
  data,
  cols = -DECADE,
  names_to = "construction",
  values_to = "count",
  values_drop_na = TRUE
)

# Preview the reshaped table.
head(data.tidy)
# Base plot object: construction on the x axis, count on the y axis.
p <- ggplot(data = data.tidy, mapping = aes(x = construction, y = count))
p + geom_boxplot()

A tibble: 6 × 3
DECADE	construction	count
<int>	<chr>	<int>
1810	SPLIT_INFINITIVE	1812
1810	UNSPLIT_INFINITIVE	1851
1820	SPLIT_INFINITIVE	1821
1820	UNSPLIT_INFINITIVE	2295
1830	SPLIT_INFINITIVE	1836
1830	UNSPLIT_INFINITIVE	2726

../_images/basics_03_descriptive_statistics_17_1.png

[24]:

# colors: outline each box in a colour keyed to its construction
p +
  geom_boxplot(mapping = aes(colour = construction))

../_images/basics_03_descriptive_statistics_18_0.png

[25]:

# with data points
# outlier.shape = NA hides the boxplot's own outlier markers, because every
# observation is already drawn by geom_jitter().
# height = 0 restricts the jitter to the horizontal direction so each point
# stays at its true count; by default geom_jitter() also shifts points
# vertically.
p +
  geom_boxplot(aes(colour = construction), outlier.shape = NA) +
  geom_jitter(width = 0.1, height = 0)

../_images/basics_03_descriptive_statistics_19_0.png