-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfitDistr.R
executable file
·126 lines (103 loc) · 4.46 KB
/
fitDistr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("optparse"))
## parse command line arguments
option_list <- list(
make_option(c("-i", "--inFile"), help="input file containing numeric value for fit (first column) (can be stdin)"),
make_option(c("-o", "--outFile"), help="output file"),
make_option(c("-d", "--distribution"), default="pois", help="name of distribution to fit (pois, nbinom, gamma, weibull, nnorm, gumbel, binom, geom, hyper) (if multiple separate them by a comma) (default=%default)"),
make_option(c("-c", "--discrete"), default=T, help="if distribution is discrete or continuous (default: %default)")
)
parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
opt <- parse_args(parser)
## check, if all required arguments are given
if(is.null(opt$inFile) | is.null(opt$outFile)) {
cat("\nProgram: fitDistr.R (R script to fit distributions)\n")
cat("Author: BRIC, University of Copenhagen, Denmark\n")
cat("Version: 1.0\n")
cat("Contact: [email protected]\n");
print_help(parser)
q()
}
## load libraries
suppressPackageStartupMessages(library(fitdistrplus))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(ismev))
suppressPackageStartupMessages(library(session))
if(identical(opt$inFile, "stdin")==T) {
data <- read.table(file("stdin"))
} else {
data <- read.table(opt$inFile)
}
## create output directory, if does not exist
#dir.create(file.path(opt$outDir), showWarnings = FALSE)
## extract input distribution names
distr=as.vector(unlist(strsplit(opt$distr, ",")))
if(nrow(data)>=10) {
## analyze each model fit
outPdf <- sprintf("%s_PLOTS.pdf", opt$outFile)
pdf(outPdf)
## check as to which distribution is the best
descdist(as.integer(data[,1]))
## fit each input distribution to the input numeric values
model <- lapply(distr, function(x) { fitdist(as.integer(data[,1]), x, method="mme") })
for (i in 1:length(model)) {
if(distr[i]=="pois") {
param <- as.vector(coef(model[[i]]))
plot(model[[i]])
data$pois <- laply(as.integer(data[,1]), function(x) { ppois(x, lambda = param[1], lower.tail = F) })
## write AIC value
outFile <- sprintf("%s_AIC", opt$outFile)
write(sprintf("POIS: %s", model[[i]]$aic), outFile, append=T)
}
else if(distr[i]=="nbinom") {
param <- as.vector(coef(model[[i]]))
plot(model[[i]])
data$pnorm <- laply(as.integer(data[,1]), function(x) { pnbinom(x, size = param[1], mu=param[2], lower.tail = F) })
## write AIC value
outFile <- sprintf("%s_AIC", opt$outFile)
write(sprintf("NBINOM: %s", model[[i]]$aic), outFile, append=T)
}
else if(distr[i]=="geom") {
param <- as.vector(coef(model[[i]]))
plot(model[[i]])
data$geom <- laply(as.integer(data[,1]), function(x) { pgeom(x, param[1], lower.tail = F) })
## write AIC value
outFile <- sprintf("%s_AIC", opt$outFile)
write(sprintf("GEOM: %s", model[[i]]$aic), outFile, append=T)
}
}
#gofstat(model[[i]], fitnames=distr)
cdfcomp(model, legendtext=distr)
#qqcomp(model, legendtext=distr)
dev.off()
## HOW TO EVALUATE WHICH MODEL IS BETTER
# one having lower AIC value (model[[i]]$aic
# more details: http://stats.stackexchange.com/questions/132652/how-to-determine-which-distribution-fits-my-data-best
} else {
data$pois <- 1
}
## write output file containing pvalue information
outTable <- sprintf("%s", opt$outFile)
write.table(data[order(-data[,1]),], outTable, sep="\t", row.names = F, col.names = F, quote = F)
## write output session file
outSession <- sprintf("%s_SESSION", opt$outFile)
save.session(outSession)
q()
## MISCELLANEOUS INFO ON DISTRIBUTION FITTING
library(ismev)
t <- gum.fit(y)
gum.diag(t)
t <-fitdistr(as.integer(y), densfun="negative binomial")
pnbinom(50, size = 0.18, mu=14.99, lower.tail = F)
t <-fitdistr(as.integer(y), densfun="poisson")
ppois(20, lambda=as.numeric(as.vector(t)[1]$estimate[1]), lower.tail = F)
set.seed(1)
x = seq(-2, 8, .01)
y = rnbinom(length(x), mu=exp(x), size=1)
fit = glm.nb(y ~ 1)
pnbinom(0.00001, size=fit$theta, prob=0.05)
predicted.y = predict(fit, newdata=data.frame(x=5), type="response")
dnbinom(100, mu=5000, size=fit$theta)
prob = function(newx, newy, fit) {
dnbinom(newy, mu=predict(fit, newdata=data.frame(x=newx), type="response"), size=fit$theta)
}