# Set the repr.plot.* size options to 4 x 4.
# NOTE(review): presumably these are read by the notebook front end when it
# renders inline figures -- confirm for the environment this script runs in.
options(
  repr.plot.width = 4,
  repr.plot.height = 4
)
library(igraph)
library(VGAM)
library(ggplot2)
#text <- tolower(paste(readLines('gutenberg/chesterton-brown.txt',
# encoding='UTF-8'), collapse = " "))
# ---- Word frequencies of the corpus (rank-frequency plot) ----

# Read the whole file, join its lines with single spaces and lower-case the
# result so that differently capitalised spellings count as the same token.
text <- tolower(paste(
  readLines("lutherbibel.txt", encoding = "UTF-8"),
  collapse = " "
))

# Tokenise on runs of whitespace. Punctuation is not stripped, so it stays
# attached to the neighbouring word. Leading whitespace produces an empty
# first token, which is removed here.
words <- unlist(strsplit(text, "\\s+"))
words <- words[words != ""]

# Count every distinct token; data.frame(table(...)) yields the columns
# `words` and `Freq`. Sort from most to least frequent.
word.frequencies <- data.frame(table(words))
word.frequencies <- word.frequencies[
  order(word.frequencies$Freq, decreasing = TRUE),
]

# Rank 1 is the most frequent token. seq_len(n) is used instead of 1:n,
# which would yield c(1, 0) for n == 0.
n <- nrow(word.frequencies)
rownames(word.frequencies) <- seq_len(n)
word.frequencies$rank <- seq_len(n)

# The 30 most frequent tokens. head() does not pad with NA rows when fewer
# than 30 distinct tokens exist, unlike indexing with 1:30.
head(word.frequencies, 30)

# Frequency against rank on doubly logarithmic axes.
plot(Freq ~ rank, data = word.frequencies, log = "xy")

# The 20 least frequent tokens.
tail(word.frequencies, 20)
# ---- Rounded log-normal sample, inspected with the same log-log plots ----

# 10000 draws of exp(standard normal), rounded to the nearest integer.
x <- round(exp(rnorm(10000)))

# Number of occurrences of each distinct value, ordered from the most to the
# least common, plotted against its position on log-log axes.
plot(sort(as.numeric(table(x)), decreasing = TRUE), log = "xy")

# Histogram of the values with both axes logarithmic.
# NOTE(review): round() maps draws below 0.5 to 0, and log10(0) is -Inf, so
# those values presumably get dropped from this plot with a warning --
# confirm that this is intended.
ggplot(data.frame(x = x), aes(x = x, y = ..count..)) +
  geom_histogram() +
  scale_x_log10() +
  scale_y_log10()
# Histogram of the per-word counts (`Freq`) from the word frequency table,
# drawn as white bars with black outlines on log10-scaled x and y axes.
ggplot(
  data = word.frequencies,
  mapping = aes(x = Freq, y = ..count..)
) +
  geom_histogram(fill = "white", colour = "black") +
  scale_y_log10() +
  scale_x_log10()
# ---- City sizes ----

# Load the table from staedte.csv; the column `Bevölkerung` (German for
# "population") is the quantity analysed below.
d <- read.csv("staedte.csv")

# Values sorted from largest to smallest, plotted against their position on
# log-log axes.
plot(sort(d$Bevölkerung, decreasing = TRUE), log = "xy")

# Histogram of the values with log10-scaled x and y axes.
ggplot(d, aes(x = Bevölkerung, y = ..count..)) +
  geom_histogram(fill = "white", colour = "black") +
  scale_x_log10() +
  scale_y_log10()

# Fit a power law to the values and display the result.
fit <- fit_power_law(d$Bevölkerung)
fit
# ---- Continuous power-law sample by inverse-transform sampling ----

# Exponent and lower cut-off of the simulated distribution.
alpha <- 2
xmin <- 10

# For U uniform on (0, 1), X = xmin * U^(1 / (1 - alpha)) satisfies
# P(X > x) = (x / xmin)^(1 - alpha) for x >= xmin (with alpha > 1), i.e. its
# density is proportional to x^(-alpha).
x <- xmin * runif(100000)^(1 / (1 - alpha))

# Histogram of the sample with log10-scaled x and y axes.
ggplot(data.frame(x = x), aes(x = x, y = ..count..)) +
  geom_histogram() +
  scale_x_log10() +
  scale_y_log10()

# Sample sorted from largest to smallest, plotted against its position on
# log-log axes.
plot(sort(x, decreasing = TRUE), log = "xy")

# zeta() evaluated at 2 for shift = 1, ..., 100.
# NOTE(review): presumably this shows how the normalising constant of the
# discrete power law changes with xmin -- confirm.
plot(zeta(2, shift = 1:100))
# Draw one random value from a discrete power law on {xmin, xmin + 1, ...}
# with P(X = x) proportional to x^(-alpha), using rejection sampling.
#
# Proposal: floor(xmin * U^(1 / (1 - alpha))) with U uniform on (0, 1), i.e.
# the integer part of a continuous power-law draw. Its probability mass at x
# is proportional to x^(1 - alpha) - (x + 1)^(1 - alpha).
#
# A proposal x is accepted with probability f(x) / (C * g(x)), where f is the
# target mass, g the proposal mass and C = f(xmin) / g(xmin). The ratio f / g
# is largest at x = xmin, so this probability never exceeds 1. All
# normalising constants (including the zeta value of the target) cancel in
# that expression, so it is evaluated from the unnormalised masses and no
# zeta() call is needed.
#
# Args:
#   alpha: single finite number > 1, the power-law exponent.
#   xmin:  single finite whole number >= 1, the smallest possible value.
#
# Returns:
#   A single numeric whole number >= xmin.
#
# Raises:
#   An error (via stopifnot) if the arguments violate the conditions above;
#   without this check such arguments lead to an endless loop, an NA
#   condition or values outside the support.
simulatePL <- function(alpha, xmin) {
  stopifnot(
    is.numeric(alpha), length(alpha) == 1L, is.finite(alpha), alpha > 1,
    is.numeric(xmin), length(xmin) == 1L, is.finite(xmin),
    xmin >= 1, xmin == floor(xmin)
  )

  # Unnormalised probability mass of the proposal at integer x.
  proposal_mass <- function(x) {
    x^(1 - alpha) - (x + 1)^(1 - alpha)
  }
  # Loop-invariant part of the acceptance probability.
  mass_at_xmin <- proposal_mass(xmin)

  repeat {
    x <- floor(xmin * runif(1)^(1 / (1 - alpha)))
    # Equals f(x) / (C * g(x)) after cancelling the normalising constants.
    accept_prob <- (x / xmin)^(-alpha) * mass_at_xmin / proposal_mass(x)
    if (runif(1) < accept_prob) {
      return(x)
    }
  }
}
# Draw 10000 values from simulatePL() with alpha = 2 and xmin = 1.
x <- replicate(n = 10000, expr = simulatePL(alpha = 2, xmin = 1))

# Histogram of the draws with log10-scaled x and y axes.
ggplot(data = data.frame(x = x), mapping = aes(x = x)) +
  geom_histogram() +
  scale_x_log10() +
  scale_y_log10()

# Fit a power law to the simulated draws.
fit_power_law(x)