From 026079a47d0ce481f5ef546ad10a4c2c4a646bfa Mon Sep 17 00:00:00 2001 From: Michael Reber Date: Mon, 18 Nov 2019 14:03:28 +0100 Subject: [PATCH] Adding R programming language --- ... - vectors, lists, matrices, data frames.r | 31 +++++ r/A histogram with tails shown in red.r | 11 ++ ...g - estimation, diagnostics, forecasting.r | 67 ++++++++++ r/Add Two Vectors.r | 16 +++ ... manner of import and export of datasets.r | 33 +++++ r/An example of simulation-based inference.r | 33 +++++ ...rrays (as in awk) or hashes (as in perl).r | 52 ++++++++ r/Binary to Decimal.r | 16 +++ r/Check Armstrong number.r | 25 ++++ r/Check Leap Year.r | 27 ++++ r/Check Odd and Even Number.r | 21 ++++ r/Check Positive, Negative or Zero.r | 24 ++++ r/Check Prime Number.r | 27 ++++ r/Compute LCM in R.r | 30 +++++ ...ence, as an example, for a sample median.r | 15 +++ r/Doing OLS.r | 44 +++++++ r/Dummy variables in regression.r | 55 +++++++++ r/Fibonacci Sequence in R.r | 38 ++++++ ...nd Factorial of a number using recursion.r | 14 +++ r/Find Minimum and Maximum.r | 33 +++++ r/Find factors of a number.r | 29 +++++ ...d sum of natural numbers without formula.r | 21 ++++ r/Find the factorial of a number.r | 20 +++ r/From Uniform Distribution.r | 8 ++ r/Handling missing data.r | 22 ++++ ...s, marginal distributions, useful tables.r | 43 +++++++ ...ariable' (LSDV) or `fixed effects' model.r | 23 ++++ ...time-series object using the zoo package.r | 31 +++++ ...n PDF files that can be put into a paper.r | 20 +++ r/Multiplication Table.r | 25 ++++ ...t y axis and another with a right y axis.r | 13 ++ r/Prices and returns.r | 99 +++++++++++++++ r/Print Fibonacci Sequence.r | 41 +++++++ r/Program to Find GCD.r | 30 +++++ r/Quartiles deciles tables graphs.r | 52 ++++++++ r/R Hello World Program.r | 11 ++ ...ecification is an argument to a function.r | 12 ++ r/README.md | 3 + ... files, reading and writing binary files.r | 43 +++++++ r/Reading in a Microsoft .r | 15 +++ ...e made by CMIE's Business Beacon program.r | 17 +++ r/Sample From a Population.r | 16 +++ ...of children with the Cauchy distribution.r | 40 ++++++ ...lse, either in email or as a binary file.r | 33 +++++ ...illed colour in a macro time-series plot.r | 18 +++ ...f the mean when compared with the median.r | 26 ++++ r/Simple Calculator in R.r | 48 ++++++++ ...study size and power in a simple problem.r | 53 ++++++++ r/Sort a Vector.r | 25 ++++ r/Special cases in reading files.r | 22 ++++ ...utations with well-studied distributions.r | 30 +++++ r/Standard tests.r | 24 ++++ r/Take input from user.r | 13 ++ r/The amazing R vector notation.r | 116 ++++++++++++++++++ r/To do OLS by MLE.r | 66 ++++++++++ ...do `moving window volatility' of returns.r | 28 +++++ r/To do nonlinear regression.r | 58 +++++++++ r/To do sorting.r | 18 +++ ... table with results of an OLS regression.r | 35 ++++++ r/To make a panel of pictures.r | 33 +++++ r/To make latex tabular out of an R matrix.r | 41 +++++++ ...ple data file where date data is present.r | 24 ++++ ...data file, and look around it's contents.r | 26 ++++ ...g R indexing notation, and the use of is.r | 43 +++++++ r/To write functions.r | 42 +++++++ ...nd a two-sample Kolmogorov-Smirnoff test.r | 44 +++++++ r/Using orthogonal polynomials.r | 61 +++++++++ r/Utilise matrix notation.r | 26 ++++ r/Vector Elements Arithmetic.r | 17 +++ r/z=f(x,y) using contour lines and colours.r | 41 +++++++ 70 files changed, 2257 insertions(+) create mode 100644 r/A first look at R objects - vectors, lists, matrices, data frames.r create mode 100644 r/A histogram with tails shown in red.r create mode 100644 r/ARMA modeling - estimation, diagnostics, forecasting.r create mode 100644 r/Add Two Vectors.r create mode 100644 r/All manner of import and export of datasets.r create mode 100644 r/An example of simulation-based inference.r create mode 100644 r/Associative arrays (as in awk) or hashes (as in perl).r create mode 100644 r/Binary to Decimal.r create mode 100644 r/Check Armstrong number.r create mode 100644 r/Check Leap Year.r create mode 100644 r/Check Odd and Even Number.r create mode 100644 r/Check Positive, Negative or Zero.r create mode 100644 r/Check Prime Number.r create mode 100644 r/Compute LCM in R.r create mode 100644 r/Do bootstrap inference, as an example, for a sample median.r create mode 100644 r/Doing OLS.r create mode 100644 r/Dummy variables in regression.r create mode 100644 r/Fibonacci Sequence in R.r create mode 100644 r/Find Factorial of a number using recursion.r create mode 100644 r/Find Minimum and Maximum.r create mode 100644 r/Find factors of a number.r create mode 100644 r/Find sum of natural numbers without formula.r create mode 100644 r/Find the factorial of a number.r create mode 100644 r/From Uniform Distribution.r create mode 100644 r/Handling missing data.r create mode 100644 r/Joint distributions, marginal distributions, useful tables.r create mode 100644 r/Least squares dummy variable' (LSDV) or `fixed effects' model.r create mode 100644 r/Make a time-series object using the zoo package.r create mode 100644 r/Make pictures in PDF files that can be put into a paper.r create mode 100644 r/Multiplication Table.r create mode 100644 r/Plotting two series on one graph, one with a left y axis and another with a right y axis.r create mode 100644 r/Prices and returns.r create mode 100644 r/Print Fibonacci Sequence.r create mode 100644 r/Program to Find GCD.r create mode 100644 r/Quartiles deciles tables graphs.r create mode 100644 r/R Hello World Program.r create mode 100644 r/R syntax where model specification is an argument to a function.r create mode 100644 r/README.md create mode 100644 r/Reading and writing ascii files, reading and writing binary files.r create mode 100644 r/Reading in a Microsoft .r create mode 100644 r/Reading in a file made by CMIE's Business Beacon program.r create mode 100644 r/Sample From a Population.r create mode 100644 r/Scare the hell out of children with the Cauchy distribution.r create mode 100644 r/Sending an R data object to someone else, either in email or as a binary file.r create mode 100644 r/Show recessions using filled colour in a macro time-series plot.r create mode 100644 r/Show the efficiency of the mean when compared with the median.r create mode 100644 r/Simple Calculator in R.r create mode 100644 r/Simulation to study size and power in a simple problem.r create mode 100644 r/Sort a Vector.r create mode 100644 r/Special cases in reading files.r create mode 100644 r/Standard computations with well-studied distributions.r create mode 100644 r/Standard tests.r create mode 100644 r/Take input from user.r create mode 100644 r/The amazing R vector notation.r create mode 100644 r/To do OLS by MLE.r create mode 100644 r/To do `moving window volatility' of returns.r create mode 100644 r/To do nonlinear regression.r create mode 100644 r/To do sorting.r create mode 100644 r/To make a latex table with results of an OLS regression.r create mode 100644 r/To make a panel of pictures.r create mode 100644 r/To make latex tabular out of an R matrix.r create mode 100644 r/To read in a simple data file where date data is present.r create mode 100644 r/To read in a simple data file, and look around it's contents.r create mode 100644 r/To show amazing R indexing notation, and the use of is.r create mode 100644 r/To write functions.r create mode 100644 r/Two CDFs and a two-sample Kolmogorov-Smirnoff test.r create mode 100644 r/Using orthogonal polynomials.r create mode 100644 r/Utilise matrix notation.r create mode 100644 r/Vector Elements Arithmetic.r create mode 100644 r/z=f(x,y) using contour lines and colours.r diff --git a/r/A first look at R objects - vectors, lists, matrices, data frames.r b/r/A first look at R objects - vectors, lists, matrices, data frames.r new file mode 100644 index 0000000..faa49fa --- /dev/null +++ b/r/A first look at R objects - vectors, lists, matrices, data frames.r @@ -0,0 +1,31 @@ +# Goals: A first look at R objects - vectors, lists, matrices, data frames. + +# To make vectors "x" "y" "year" and "names" +x <- c(2,3,7,9) +y <- c(9,7,3,2) +year <- 1990:1993 +names <- c("payal", "shraddha", "kritika", "itida") +# Accessing the 1st and last elements of y -- +y[1] +y[length(y)] + +# To make a list "person" -- +person <- list(name="payal", x=2, y=9, year=1990) +person +# Accessing things inside a list -- +person$name +person$x + +# To make a matrix, pasting together the columns "year" "x" and "y" +# The verb cbind() stands for "column bind" +cbind(year, x, y) + +# To make a "data frame", which is a list of vectors of the same length -- +D <- data.frame(names, year, x, y) +nrow(D) +# Accessing one of these vectors +D$names +# Accessing the last element of this vector +D$names[nrow(D)] +# Or equally, +D$names[length(D$names)] \ No newline at end of file diff --git a/r/A histogram with tails shown in red.r b/r/A histogram with tails shown in red.r new file mode 100644 index 0000000..589b206 --- /dev/null +++ b/r/A histogram with tails shown in red.r @@ -0,0 +1,11 @@ +# Goal: A histogram with tails shown in red. + +# This happened on the R mailing list on 7 May 2004. +# This is by Martin Maechler , who was +# responding to a slightly imperfect version of this by +# "Guazzetti Stefano" + +x <- rnorm(1000) +hx <- hist(x, breaks=100, plot=FALSE) +plot(hx, col=ifelse(abs(hx$breaks) < 1.669, 4, 2)) + # What is cool is that "col" is supplied a vector. \ No newline at end of file diff --git a/r/ARMA modeling - estimation, diagnostics, forecasting.r b/r/ARMA modeling - estimation, diagnostics, forecasting.r new file mode 100644 index 0000000..681b05b --- /dev/null +++ b/r/ARMA modeling - estimation, diagnostics, forecasting.r @@ -0,0 +1,67 @@ +# Goals: ARMA modeling - estimation, diagnostics, forecasting. + + +# 0. SETUP DATA +rawdata <- c(-0.21,-2.28,-2.71,2.26,-1.11,1.71,2.63,-0.45,-0.11,4.79,5.07,-2.24,6.46,3.82,4.29,-1.47,2.69,7.95,4.46,7.28,3.43,-3.19,-3.14,-1.25,-0.50,2.25,2.77,6.72,9.17,3.73,6.72,6.04,10.62,9.89,8.23,5.37,-0.10,1.40,1.60,3.40,3.80,3.60,4.90,9.60,18.20,20.60,15.20,27.00,15.42,13.31,11.22,12.77,12.43,15.83,11.44,12.32,12.10,12.02,14.41,13.54,11.36,12.97,10.00,7.20,8.74,3.92,8.73,2.19,3.85,1.48,2.28,2.98,4.21,3.85,6.52,8.16,5.36,8.58,7.00,10.57,7.12,7.95,7.05,3.84,4.93,4.30,5.44,3.77,4.71,3.18,0.00,5.25,4.27,5.14,3.53,4.54,4.70,7.40,4.80,6.20,7.29,7.30,8.38,3.83,8.07,4.88,8.17,8.25,6.46,5.96,5.88,5.03,4.99,5.87,6.78,7.43,3.61,4.29,2.97,2.35,2.49,1.56,2.65,2.49,2.85,1.89,3.05,2.27,2.91,3.94,2.34,3.14,4.11,4.12,4.53,7.11,6.17,6.25,7.03,4.13,6.15,6.73,6.99,5.86,4.19,6.38,6.68,6.58,5.75,7.51,6.22,8.22,7.45,8.00,8.29,8.05,8.91,6.83,7.33,8.52,8.62,9.80,10.63,7.70,8.91,7.50,5.88,9.82,8.44,10.92,11.67) + +# Make a R timeseries out of the rawdata: specify frequency & startdate +gIIP <- ts(rawdata, frequency=12, start=c(1991,4)) +print(gIIP) +plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2, + main="Full data") +grid() + +# Based on this, I decide that 4/1995 is the start of the sensible period. +gIIP <- window(gIIP, start=c(1995,4)) +print(gIIP) +plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2, + main="Estimation subset") +grid() + +# Descriptive statistics about gIIP +mean(gIIP); sd(gIIP); summary(gIIP); +plot(density(gIIP), col="blue", main="(Unconditional) Density of IIP growth") +acf(gIIP) + + +# 1. ARMA ESTIMATION +m.ar2 <- arima(gIIP, order = c(2,0,0)) +print(m.ar2) # Print it out + + +# 2. ARMA DIAGNOSTICS +tsdiag(m.ar2) # His pretty picture of diagnostics +## Time series structure in errors +print(Box.test(m.ar2$residuals, lag=12, type="Ljung-Box")); +## Sniff for ARCH +print(Box.test(m.ar2$residuals^2, lag=12, type="Ljung-Box")); +## Eyeball distribution of residuals +plot(density(m.ar2$residuals), col="blue", xlim=c(-8,8), + main=paste("Residuals of AR(2)")) + + +# 3. FORECASTING +## Make a picture of the residuals +plot.ts(m.ar2$residual, ylab="Innovations", col="blue", lwd=2) +s <- sqrt(m.ar2$sigma2) +abline(h=c(-s,s), lwd=2, col="lightGray") + +p <- predict(m.ar2, n.ahead = 12) # Make 12 predictions. +print(p) + +## Watch the forecastability decay away from fat values to 0. +## sd(x) is the naive sigma. p$se is the prediction se. +gain <- 100*(1-p$se/sd(gIIP)) +plot.ts(gain, main="Gain in forecast s.d.", ylab="Per cent", + col="blue", lwd=2) + +## Make a pretty picture that puts it all together +ts.plot(gIIP, p$pred, p$pred-1.96*p$se, p$pred+1.96*p$se, + gpars=list(lty=c(1,1,2,2), lwd=c(2,2,1,1), + ylab="IIP growth (%)", col=c("blue","red", "red", "red"))) +grid() +abline(h=mean(gIIP), lty=2, lwd=2, col="lightGray") +legend(x="bottomleft", cex=0.8, bty="n", + lty=c(1,1,2,2), lwd=c(2,1,1,2), + col=c("blue", "red", "red", "lightGray"), + legend=c("IIP", "AR(2) forecasts", "95% C.I.", "Mean IIP growth")) \ No newline at end of file diff --git a/r/Add Two Vectors.r b/r/Add Two Vectors.r new file mode 100644 index 0000000..1e22d09 --- /dev/null +++ b/r/Add Two Vectors.r @@ -0,0 +1,16 @@ +> x +[1] 3 6 8 +> y +[1] 2 9 0 + +> x + y +[1] 5 15 8 + +> x + 1 # 1 is recycled to (1,1,1) +[1] 4 7 9 + +> x + c(1,4) # (1,4) is recycled to (1,4,1) but warning issued +[1] 4 10 9 +Warning message: +In x + c(1, 4) : + longer object length is not a multiple of shorter object length \ No newline at end of file diff --git a/r/All manner of import and export of datasets.r b/r/All manner of import and export of datasets.r new file mode 100644 index 0000000..d73a8c8 --- /dev/null +++ b/r/All manner of import and export of datasets.r @@ -0,0 +1,33 @@ +# Goal: All manner of import and export of datasets. + +# Invent a dataset -- +A <- data.frame( + name=c("a","b","c"), + ownership=c("Case 1","Case 1","Case 2"), + listed.at=c("NSE",NA,"BSE"), + # Firm "b" is unlisted. + is.listed=c(TRUE,FALSE,TRUE), + # R convention - boolean variables are named "is.something" + x=c(2.2,3.3,4.4), + date=as.Date(c("2004-04-04","2005-05-05","2006-06-06")) + ) + +# To a spreadsheet through a CSV file -- +write.table(A,file="demo.csv",sep = ",",col.names = NA,qmethod = "double") +B <- read.table("demo.csv", header = TRUE, sep = ",", row.names = 1) + +# To R as a binary file -- +save(A, file="demo.rda") +load("demo.rda") + +# To the Open XML standard for transport for statistical data -- +library(StatDataML) +writeSDML(A, "/tmp/demo.sdml") +B <- readSDML("/tmp/demo.sdml") + +# To Stata -- +library(foreign) +write.dta(A, "/tmp/demo.dta") +B <- read.dta("/tmp/demo.dta") + +# foreign::write.foreign() also has a pathway to SAS and SPSS. \ No newline at end of file diff --git a/r/An example of simulation-based inference.r b/r/An example of simulation-based inference.r new file mode 100644 index 0000000..8fc70d5 --- /dev/null +++ b/r/An example of simulation-based inference.r @@ -0,0 +1,33 @@ +# Goal: An example of simulation-based inference. +# This is in the context of testing for time-series dependence in +# stock market returns data. +# The code here does the idea of Kim, Nelson, Startz (1991). +# We want to use the distribution of realworld returns data, without +# needing assumptions about normality. +# The null is lack of dependence (i.e. an efficient market). +# So repeatedly, the data is permuted, and the sample ACF is computed. +# This gives us the distribution of the ACF under H0: independence, but +# while using the empirical distribution of the returns data. + +# Weekly returns on Nifty, 1/1/2002 to 31/12/2003, 104 weeks of data. +r <- c(-0.70031182197603, 0.421690133064168, -1.20098072984689, 0.143402360644984, 3.81836537549516, 3.17055939373247, 0.305580301919228, 1.23853814691852, 0.81584795095706, -1.51865139747764, -2.71223626421522, -0.784836480094242, 1.09180041170998, 0.397649587762761, -4.11309534220923, -0.263912425099111, -0.0410144239805454, 1.75756212770972, -2.3335373897992, -2.19228764624217, -3.64578978183987, 1.92535789661354, 3.45782867883164, -2.15532607229374, -0.448039988298987, 1.50124793565896, -1.45871585874362, -2.13459863369767, -6.2128068251802, -1.94482987066289, 0.751294815735637, 1.78244982829590, 1.61567494389745, 1.53557708728931, -1.53557708728931, -0.322061470004265, -2.28394919698225, 0.70399304137414, -2.93580952607737, 2.38125098034425, 0.0617697039252185, -4.14482733720716, 2.04397528093754, 0.576400673606603, 3.43072725191913, 2.96465382864843, 2.89833358015583, 1.85387040058336, 1.52136515035952, -0.637268376944444, 1.75418926224609, -0.804391905851354, -0.861816058320475, 0.576902488444109, -2.84259880663331, -1.35375536139417, 1.49096529042234, -2.05404881010045, 2.86868849528146, -0.258270670200478, -4.4515881438687, -1.73055019137092, 3.04427015714648, -2.94928202352018, 1.62081315773994, -6.83117945164824, -0.962715713711582, -1.75875847071740, 1.50330330252721, -0.0479705789653728, 3.68968303215933, -0.535807567290103, 3.94034871061182, 3.85787174417738, 0.932185956989873, 4.08598654183674, 2.27343783689715, 1.13958830440017, 2.01737201171230, -1.88131458327554, 1.97596267156648, 2.79857144562001, 2.22470306481695, 2.03212951411427, 4.95626853448883, 3.40400972901396, 3.03840139165246, -1.89863129741417, -3.70832135042951, 4.78478922155396, 4.3973589590097, 4.9667050392987, 2.99775078737081, -4.12349101552438, 3.25638269809945, 2.29683376253966, -2.64772825878214, -0.630835277076258, 4.72528848505451, 1.87368447333380, 3.17543946162564, 4.58174427843208, 3.23625985632168, 2.29777651227296) + +# The 1st autocorrelation from the sample: +acf(r, 1, plot=FALSE)$acf[2] + +# Obtain 1000 draws from the distribution of the 1st autocorrelation +# under the null of independence: +set.seed <- 101 +simulated <- replicate(1000, acf(r[sample(1:104, replace=FALSE)], 1, plot=FALSE)$acf[2]) +# At 95% -- +quantile(simulated, probs=c(.025,.975)) +# At 99% -- +quantile(simulated, probs=c(.005,.995)) + +# So we can reject the null at 95% but not at 99%. + +# A pretty picture. +plot(density(simulated), col="blue") +abline(v=0) +abline(v=quantile(simulated, probs=c(.025,.975)), lwd=2, col="purple") +abline(v=acf(r, 1, plot=FALSE)$acf[2], lty=2, lwd=4, col="yellow") \ No newline at end of file diff --git a/r/Associative arrays (as in awk) or hashes (as in perl).r b/r/Associative arrays (as in awk) or hashes (as in perl).r new file mode 100644 index 0000000..5802114 --- /dev/null +++ b/r/Associative arrays (as in awk) or hashes (as in perl).r @@ -0,0 +1,52 @@ +# Goal: Associative arrays (as in awk) or hashes (as in perl). +# Or, more generally, adventures in R addressing. + +# Here's a plain R vector: +x <- c(2,3,7,9) +# But now I tag every elem with labels: +names(x) <- c("kal","sho","sad","aja") +# Associative array operations: +x["kal"] <- 12 +# Pretty printing the entire associative array: +x + +# This works for matrices too: +m <- matrix(runif(10), nrow=5) +rownames(m) <- c("violet","indigo","blue","green","yellow") +colnames(m) <- c("Asia","Africa") +# The full matrix -- +m +# Or even better -- +library(xtable) +xtable(m) + +# Now address symbolically -- +m[,"Africa"] +m["indigo",] +m["indigo","Africa"] + +# The "in" operator, as in awk -- +for (colour in c("yellow", "orange", "red")) { + if (colour %in% rownames(m)) { + cat("For Africa and ", colour, " we have ", m[colour, "Africa"], "\n") + } else { + cat("Colour ", colour, " does not exist in the hash.\n") + } +} + +# This works for data frames also -- +D <- data.frame(m) +D +# Look closely at what happened -- +str(D) # The colours are the rownames(D). + +# Operations -- +D$Africa +D[,"Africa"] +D["yellow",] +# or +subset(D, rownames(D)=="yellow") + +colnames(D) <- c("Antarctica","America") +D +D$America \ No newline at end of file diff --git a/r/Binary to Decimal.r b/r/Binary to Decimal.r new file mode 100644 index 0000000..ee7a066 --- /dev/null +++ b/r/Binary to Decimal.r @@ -0,0 +1,16 @@ +Binary to Decimal +# Program to convert decimal +# number into binary number +# using recursive function + +convert_to_binary <- function(n) { + + if(n > 1) { + convert_to_binary(as.integer(n/2)) + } + cat(n %% 2) +} +Output + +> convert_to_binary(52) +110100 \ No newline at end of file diff --git a/r/Check Armstrong number.r b/r/Check Armstrong number.r new file mode 100644 index 0000000..6459233 --- /dev/null +++ b/r/Check Armstrong number.r @@ -0,0 +1,25 @@ + Check Armstrong number +# take input from the user +num = as.integer(readline(prompt="Enter a number: ")) + +# initialize sum +sum = 0 + +# find the sum of the cube of each digit +temp = num +while(temp > 0) { + digit = temp %% 10 + sum = sum + (digit ^ 3) + temp = floor(temp / 10) +} + +# display the result +if(num == sum) { + print(paste(num, "is an Armstrong number")) +} else { + print(paste(num, "is not an Armstrong number")) +} +Output 1 + +Enter a number: 23 +[1] "23 is not an Armstrong number" \ No newline at end of file diff --git a/r/Check Leap Year.r b/r/Check Leap Year.r new file mode 100644 index 0000000..e164ea7 --- /dev/null +++ b/r/Check Leap Year.r @@ -0,0 +1,27 @@ +Check Leap Year +# Program to check if +# the input year is +# a leap year or not + +year = as.integer(readline(prompt="Enter a year: ")) +if((year %% 4) == 0) { + if((year %% 100) == 0) { + if((year %% 400) == 0) { + print(paste(year,"is a leap year")) + } else { + print(paste(year,"is not a leap year")) + } + } else { + print(paste(year,"is a leap year")) + } +} else { + print(paste(year,"is not a leap year")) +} +Output 1 + +Enter a year: 1900 +[1] "1900 is not a leap year" +Output 2 + +Enter a year: 2000 +[1] "2000 is a leap year" \ No newline at end of file diff --git a/r/Check Odd and Even Number.r b/r/Check Odd and Even Number.r new file mode 100644 index 0000000..f4eceeb --- /dev/null +++ b/r/Check Odd and Even Number.r @@ -0,0 +1,21 @@ +Check Odd and Even Number +# Program to check if +# the input number is odd or even. +# A number is even if division +# by 2 give a remainder of 0. +# If remainder is 1, it is odd. + +num = as.integer(readline(prompt="Enter a number: ")) +if((num %% 2) == 0) { + print(paste(num,"is Even")) +} else { + print(paste(num,"is Odd")) +} +Output 1 + +Enter a number: 89 +[1] "89 is Odd" +Output 2 + +Enter a number: 0 +[1] "0 is Even" \ No newline at end of file diff --git a/r/Check Positive, Negative or Zero.r b/r/Check Positive, Negative or Zero.r new file mode 100644 index 0000000..e1505ba --- /dev/null +++ b/r/Check Positive, Negative or Zero.r @@ -0,0 +1,24 @@ +Check Positive, Negative or Zero +# In this program, we input a number +# check if the number is positive or +# negative or zero and display +# an appropriate message + +num = as.double(readline(prompt="Enter a number: ")) +if(num > 0) { + print("Positive number") +} else { + if(num == 0) { + print("Zero") + } else { + print("Negative number") + } +} +Output 1 + +Enter a number: -9.6 +[1] "Negative number" +Output 2 + +Enter a number: 2 +[1] "Positive number" \ No newline at end of file diff --git a/r/Check Prime Number.r b/r/Check Prime Number.r new file mode 100644 index 0000000..bb54021 --- /dev/null +++ b/r/Check Prime Number.r @@ -0,0 +1,27 @@ +Check Prime Number +# Program to check if +# the input number is +# prime or not + +# take input from the user +num = as.integer(readline(prompt="Enter a number: ")) + +flag = 0 +# prime numbers are greater than 1 +if(num > 1) { + # check for factors + flag = 1 + for(i in 2:(num-1)) { + if ((num %% i) == 0) { + flag = 0 + break + } + } +} + +if(num == 2) flag = 1 +if(flag == 1) { + print(paste(num,"is a prime number")) +} else { + print(paste(num,"is not a prime number")) +} \ No newline at end of file diff --git a/r/Compute LCM in R.r b/r/Compute LCM in R.r new file mode 100644 index 0000000..975ccae --- /dev/null +++ b/r/Compute LCM in R.r @@ -0,0 +1,30 @@ +Compute LCM in R +# Program to find the L.C.M. of two input number +lcm <- function(x, y) { + # choose the greater number + if(x > y) { + greater = x + } else { + greater = y + } + + while(TRUE) { + if((greater %% x == 0) && (greater %% y == 0)) { + lcm = greater + break + } + greater = greater + 1 + } + return(lcm) +} + +# take input from the user +num1 = as.integer(readline(prompt = "Enter first number: ")) +num2 = as.integer(readline(prompt = "Enter second number: ")) + +print(paste("The L.C.M. of", num1,"and", num2,"is", lcm(num1, num2))) +Output + +Enter first number: 24 +Enter second number: 25 +[1] "The L.C.M. of 24 and 25 is 600" \ No newline at end of file diff --git a/r/Do bootstrap inference, as an example, for a sample median.r b/r/Do bootstrap inference, as an example, for a sample median.r new file mode 100644 index 0000000..10467bb --- /dev/null +++ b/r/Do bootstrap inference, as an example, for a sample median.r @@ -0,0 +1,15 @@ +# Goals: Do bootstrap inference, as an example, for a sample median. + +library(boot) + +samplemedian <- function(x, d) { # d is a vector of integer indexes + return(median(x[d])) # The genius is in the x[d] notation +} + +data <- rnorm(50) # Generate a dataset with 50 obs +b <- boot(data, samplemedian, R=2000) # 2000 bootstrap replications +cat("Sample median has a sigma of ", sd(b$t[,1]), "\n") +plot(b) + +# Make a 99% confidence interval +boot.ci(b, conf=0.99, type="basic") \ No newline at end of file diff --git a/r/Doing OLS.r b/r/Doing OLS.r new file mode 100644 index 0000000..e45a4f7 --- /dev/null +++ b/r/Doing OLS.r @@ -0,0 +1,44 @@ +# Goal: Simulate a dataset from the OLS model and obtain +# obtain OLS estimates for it. + +x <- runif(100, 0, 10) # 100 draws from U(0,10) +y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma = 1 + +# You want to just look at OLS results? +summary(lm(y ~ x)) + +# Suppose x and y were packed together in a data frame -- +D <- data.frame(x,y) +summary(lm(y ~ x, D)) + +# Full and elaborate steps -- +d <- lm(y ~ x) +# Learn about this object by saying ?lm and str(d) +# Compact model results -- +print(d) +# Pretty graphics for regression diagnostics -- +par(mfrow=c(2,2)) +plot(d) + +d <- summary(d) +# Detailed model results -- +print(d) +# Learn about this object by saying ?summary.lm and by saying str(d) +cat("OLS gave slope of ", d$coefficients[2,1], + "and a error sigma of ", d$sigma, "\n") + + +## I need to drop down to a smaller dataset now -- +x <- runif(10) +y <- 2 + 3*x + rnorm(10) +m <- lm(y ~ x) + +# Now R supplies a wide range of generic functions which extract +# useful things out of the result of estimation of many kinds of models. + +residuals(m) +fitted(m) +AIC(m) +AIC(m, k=log(10)) # SBC +vcov(m) +logLik(m) \ No newline at end of file diff --git a/r/Dummy variables in regression.r b/r/Dummy variables in regression.r new file mode 100644 index 0000000..d02abe2 --- /dev/null +++ b/r/Dummy variables in regression.r @@ -0,0 +1,55 @@ +# Goal: "Dummy variables" in regression. + +# Suppose you have this data: +people = data.frame( + age = c(21,62,54,49,52,38), + education = c("college", "school", "none", "school", "college", "none"), + education.code = c( 2, 1, 0, 1, 2, 0 ) + ) +# Here people$education is a string categorical variable and +# people$education.code is the same thing, with a numerical coding system. +people + +# Note the structure of the dataset -- +str(people) +# The strings supplied for `education' have been treated (correctly) as +# a factor, but education.code is being treated as an integer and not as +# a factor. + + +# We want to do a dummy variable regression. Normally you would have: +# 1 Chosen college as the omitted category +# 2 Made a dummy for "none" named educationnone +# 3 Made a dummy for "school" named educationschool +# 4 Ran a regression like lm(age ~ educationnone + educationschool, people) +# But this is R. Things are cool: +lm(age ~ education, people) + +# ! :-) +# When you feed him an explanatory variable like education, he does all +# these steps automatically. (He chose college as the omitted category). + +# If you use an integer coding, then the obvious thing goes wrong -- +lm(age ~ education.code, people) +# because he's thinking that education.code is an integer explanatory +# variable. So you need to: + +lm(age ~ factor(education.code), people) +# (he choose a different omitted category) + +# Alternatively, fix up the dataset -- +people$education.code <- factor(people$education.code) +lm(age ~ education.code, people) + +# +# Bottom line: +# Once the dataset has categorical variables correctly represented as factors, i.e. as +str(people) +# doing OLS in R induces automatic generation of dummy variables while leaving one out: +lm(age ~ education, people) +lm(age ~ education.code, people) + +# But what if you want the X matrix? +m <- lm(age ~ education, people) +model.matrix(m) +# This is the design matrix that went into the regression m. \ No newline at end of file diff --git a/r/Fibonacci Sequence in R.r b/r/Fibonacci Sequence in R.r new file mode 100644 index 0000000..d9136db --- /dev/null +++ b/r/Fibonacci Sequence in R.r @@ -0,0 +1,38 @@ +Fibonacci Sequence in R +# Program to diplay the Fibonacci +# sequence up to n-th term using +# recursive functions + +recurse_fibonacci <- function(n) { + if(n <= 1) { + return(n) + } else { + return(recurse_fibonacci(n-1) + recurse_fibonacci(n-2)) + } +} + +# take input from the user +nterms = as.integer(readline(prompt="How many terms? ")) + +# check if the number of terms is valid +if(nterms <= 0) { + print("Plese enter a positive integer") +} else { + print("Fibonacci sequence:") + for(i in 0:(nterms-1)) { + print(recurse_fibonacci(i)) + } +} +Output + +How many terms? 9 +[1] "Fibonacci sequence:" +[1] 0 +[1] 1 +[1] 1 +[1] 2 +[1] 3 +[1] 5 +[1] 8 +[1] 13 +[1] 21 \ No newline at end of file diff --git a/r/Find Factorial of a number using recursion.r b/r/Find Factorial of a number using recursion.r new file mode 100644 index 0000000..912c338 --- /dev/null +++ b/r/Find Factorial of a number using recursion.r @@ -0,0 +1,14 @@ +Find Factorial of a number using recursion +recur_factorial <- function(n) { + if(n <= 1) { + return(1) + } else { + return(n * recur_factorial(n-1)) + } +} + + +Output + +> recur_factorial(5) +[1] 120 \ No newline at end of file diff --git a/r/Find Minimum and Maximum.r b/r/Find Minimum and Maximum.r new file mode 100644 index 0000000..9f5f45f --- /dev/null +++ b/r/Find Minimum and Maximum.r @@ -0,0 +1,33 @@ +Find Minimum and Maximum +> x +[1] 5 8 3 9 2 7 4 6 10 + +> # find the minimum +> min(x) +[1] 2 + +> # find the maximum +> max(x) +[1] 10 + +> # find the range +> range(x) +[1] 2 10 +If we want to find where the minimum or maximum is located, i.e. the index instead of the actual value, then we can use which.min() and which.max() functions. + +Note that these functions will return the index of the first minimum or maximum in case multiple of them exists. + +> x +[1] 5 8 3 9 2 7 4 6 10 + +> # find index of the minimum +> which.min(x) +[1] 5 + +> # find index of the minimum +> which.max(x) +[1] 9 + +> # alternate way to find the minimum +> x[which.min(x)] +[1] 2 \ No newline at end of file diff --git a/r/Find factors of a number.r b/r/Find factors of a number.r new file mode 100644 index 0000000..f8d563f --- /dev/null +++ b/r/Find factors of a number.r @@ -0,0 +1,29 @@ +Find factors of a number +print_factors <- function(x) { + print(paste("The factors of",x,"are:")) + for(i in 1:x) { + if((x %% i) == 0) { + print(i) + } + } +} +Output + +> print_factors(120) +[1] "The factors of 120 are:" +[1] 1 +[1] 2 +[1] 3 +[1] 4 +[1] 5 +[1] 6 +[1] 8 +[1] 10 +[1] 12 +[1] 15 +[1] 20 +[1] 24 +[1] 30 +[1] 40 +[1] 60 +[1] 120 \ No newline at end of file diff --git a/r/Find sum of natural numbers without formula.r b/r/Find sum of natural numbers without formula.r new file mode 100644 index 0000000..3bf90fa --- /dev/null +++ b/r/Find sum of natural numbers without formula.r @@ -0,0 +1,21 @@ +Find sum of natural numbers without formula +# take input from the user +num = as.integer(readline(prompt = "Enter a number: ")) + +if(num < 0) { + print("Enter a positive number") +} else { + sum = 0 + + # use while loop to iterate until zero + while(num > 0) { + sum = sum + num + num = num - 1 + } + + print(paste("The sum is", sum)) +} +Output + +Enter a number: 10 +[1] "The sum is 55" \ No newline at end of file diff --git a/r/Find the factorial of a number.r b/r/Find the factorial of a number.r new file mode 100644 index 0000000..3aba6c4 --- /dev/null +++ b/r/Find the factorial of a number.r @@ -0,0 +1,20 @@ +#Find the factorial of a number +# take input from the user +num = as.integer(readline(prompt="Enter a number: ")) +factorial = 1 + +# check is the number is negative, positive or zero +if(num < 0) { + print("Sorry, factorial does not exist for negative numbers") +} else if(num == 0) { + print("The factorial of 0 is 1") +} else { + for(i in 1:num) { + factorial = factorial * i + } + print(paste("The factorial of", num ,"is",factorial)) +} +Output + +Enter a number: 8 +[1] "The factorial of 8 is 40320" \ No newline at end of file diff --git a/r/From Uniform Distribution.r b/r/From Uniform Distribution.r new file mode 100644 index 0000000..1ecd187 --- /dev/null +++ b/r/From Uniform Distribution.r @@ -0,0 +1,8 @@ +> runif(1) # generates 1 random number +[1] 0.3984754 + +> runif(3) # generates 3 random number +[1] 0.8090284 0.1797232 0.6803607 + +> runif(3, min=5, max=10) # define the range between 5 and 10 +[1] 7.099781 8.355461 5.173133 \ No newline at end of file diff --git a/r/Handling missing data.r b/r/Handling missing data.r new file mode 100644 index 0000000..0ccb77a --- /dev/null +++ b/r/Handling missing data.r @@ -0,0 +1,22 @@ +# Goal: +# A stock is traded on 2 exchanges. +# Price data is missing at random on both exchanges owing to non-trading. +# We want to make a single price time-series utilising information +# from both exchanges. I.e., missing data for exchange 1 will +# be replaced by information for exchange 2 (if observed). + +# Let's create some example data for the problem. +e1 <- runif(15) # Prices on exchange 1 +e2 <- e1 + 0.05*rnorm(15) # Prices on exchange 2. +cbind(e1, e2) +# Blow away 5 points from each at random. +e1[sample(1:15, 5)] <- NA +e2[sample(1:15, 5)] <- NA +cbind(e1, e2) + +# Now how do we reconstruct a time-series that tries to utilise both? +combined <- e1 # Do use the more liquid exchange here. +missing <- is.na(combined) +combined[missing] <- e2[missing] # if it's also missing, I don't care. +cbind(e1, e2, combined) +# There you are. \ No newline at end of file diff --git a/r/Joint distributions, marginal distributions, useful tables.r b/r/Joint distributions, marginal distributions, useful tables.r new file mode 100644 index 0000000..6053d66 --- /dev/null +++ b/r/Joint distributions, marginal distributions, useful tables.r @@ -0,0 +1,43 @@ +# Goal: Joint distributions, marginal distributions, useful tables. + +# First let me invent some fake data +set.seed(102) # This yields a good illustration. +x <- sample(1:3, 15, replace=TRUE) +education <- factor(x, labels=c("None", "School", "College")) +x <- sample(1:2, 15, replace=TRUE) +gender <- factor(x, labels=c("Male", "Female")) +age <- runif(15, min=20,max=60) + +D <- data.frame(age, gender, education) +rm(x,age,gender,education) +print(D) + +# Table about education +table(D$education) + +# Table about education and gender -- +table(D$gender, D$education) +# Joint distribution of education and gender -- +table(D$gender, D$education)/nrow(D) + +# Add in the marginal distributions also +addmargins(table(D$gender, D$education)) +addmargins(table(D$gender, D$education))/nrow(D) + +# Generate a good LaTeX table out of it -- +library(xtable) +xtable(addmargins(table(D$gender, D$education))/nrow(D), + digits=c(0,2,2,2,2)) # You have to do | and \hline manually. + +# Study age by education category +by(D$age, D$gender, mean) +by(D$age, D$gender, sd) +by(D$age, D$gender, summary) + +# Two-way table showing average age depending on education & gender +a <- matrix(by(D$age, list(D$gender, D$education), mean), nrow=2) +rownames(a) <- levels(D$gender) +colnames(a) <- levels(D$education) +print(a) +# or, of course, +print(xtable(a)) \ No newline at end of file diff --git a/r/Least squares dummy variable' (LSDV) or `fixed effects' model.r b/r/Least squares dummy variable' (LSDV) or `fixed effects' model.r new file mode 100644 index 0000000..5aa439c --- /dev/null +++ b/r/Least squares dummy variable' (LSDV) or `fixed effects' model.r @@ -0,0 +1,23 @@ +# Goals: Simulate a dataset from a "fixed effects" model, and +# obtain "least squares dummy variable" (LSDV) estimates. +# +# We do this in the context of a familiar "earnings function" - +# log earnings is quadratic in log experience, with parallel shifts by +# education category. + +# Create an education factor with 4 levels -- +education <- factor(sample(1:4,1000, replace=TRUE), + labels=c("none", "school", "college", "beyond")) +# Simulate an experience variable with a plausible range -- +experience <- 30*runif(1000) # experience from 0 to 20 years +# Make the intercept vary by education category between 4 given values -- +intercept <- c(0.5,1,1.5,2)[education] + +# Simulate the log earnings -- +log.earnings <- intercept + + 2*experience - 0.05*experience*experience + rnorm(1000) +A <- data.frame(education, experience, e2=experience*experience, log.earnings) +summary(A) + +# The OLS path to LSDV -- +summary(lm(log.earnings ~ -1 + education + experience + e2, A)) \ No newline at end of file diff --git a/r/Make a time-series object using the zoo package.r b/r/Make a time-series object using the zoo package.r new file mode 100644 index 0000000..6ee5cb7 --- /dev/null +++ b/r/Make a time-series object using the zoo package.r @@ -0,0 +1,31 @@ +# Goal: Make a time-series object using the "zoo" package + +A <- data.frame(date=c("1995-01-01", "1995-01-02", "1995-01-03", "1995-01-06"), + x=runif(4), + y=runif(4)) +A$date <- as.Date(A$date) # yyyy-mm-dd is the default format +# So far there's nothing new - it's just a data frame. I have hand- +# constructed A but you could equally have obtained it using read.table(). + +# I want to make a zoo matrix out of the numerical columns of A +library(zoo) +B <- A +B$date <- NULL +z <- zoo(as.matrix(B), order.by=A$date) +rm(A, B) + +# So now you are holding "z", a "zoo" object. You can do many cool +# things with it. +# See http://www.google.com/search?hl=en&q=zoo+quickref+achim&btnI=I%27m+Feeling+Lucky + +# To drop down to a plain data matrix, say +C <- coredata(z) +rownames(C) <- as.character(time(z)) +# Compare -- +str(C) +str(z) + +# The above is a tedious way of doing these things, designed to give you +# an insight into what is going on. If you just want to read a file +# into a zoo object, a very short path is something like: +# z <- read.zoo(filename, format="%d %b %Y") \ No newline at end of file diff --git a/r/Make pictures in PDF files that can be put into a paper.r b/r/Make pictures in PDF files that can be put into a paper.r new file mode 100644 index 0000000..2bf6cdd --- /dev/null +++ b/r/Make pictures in PDF files that can be put into a paper.r @@ -0,0 +1,20 @@ +# Goal: Make pictures in PDF files that can be put into a paper. + +xpts <- seq(-3,3,.05) + +# Here is my suggested setup for a two-column picture -- +pdf("demo2.pdf", width=5.6, height=2.8, bg="cadetblue1", pointsize=8) +par(mai=c(.6,.6,.2,.2)) +plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4", + xlab="x", ylab="sin(x*x)") +grid(col="white", lty=1, lwd=.2) +abline(h=0, v=0) + +# My suggested setup for a square one-column picture -- +pdf("demo1.pdf", width=2.8, height=2.8, bg="cadetblue1", pointsize=8) +par(mai=c(.6,.6,.2,.2)) +plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4", + xlab="x", ylab="sin(x*x)") +grid(col="white", lty=1, lwd=.2) +abline(h=0, v=0) + diff --git a/r/Multiplication Table.r b/r/Multiplication Table.r new file mode 100644 index 0000000..508ff38 --- /dev/null +++ b/r/Multiplication Table.r @@ -0,0 +1,25 @@ +Multiplication Table +# Program to find the multiplication +# table (from 1 to 10) +# of a number input by the user + +# take input from the user +num = as.integer(readline(prompt = "Enter a number: ")) + +# use for loop to iterate 10 times +for(i in 1:10) { + print(paste(num,'x', i, '=', num*i)) +} +Output + +Enter a number: 7 +[1] "7 x 1 = 7" +[1] "7 x 2 = 14" +[1] "7 x 3 = 21" +[1] "7 x 4 = 28" +[1] "7 x 5 = 35" +[1] "7 x 6 = 42" +[1] "7 x 7 = 49" +[1] "7 x 8 = 56" +[1] "7 x 9 = 63" +[1] "7 x 10 = 70" \ No newline at end of file diff --git a/r/Plotting two series on one graph, one with a left y axis and another with a right y axis.r b/r/Plotting two series on one graph, one with a left y axis and another with a right y axis.r new file mode 100644 index 0000000..dc55570 --- /dev/null +++ b/r/Plotting two series on one graph, one with a left y axis and another with a right y axis.r @@ -0,0 +1,13 @@ +# Goal: Display two series on one plot, one with a left y axis +# and another with a right y axis. + +y1 <- cumsum(rnorm(100)) +y2 <- cumsum(rnorm(100, mean=0.2)) + +par(mai=c(.8, .8, .2, .8)) +plot(1:100, y1, type="l", col="blue", xlab="X axis label", ylab="Left legend") +par(new=TRUE) +plot(1:100, y2, type="l", ann=FALSE, yaxt="n") +axis(4) +legend(x="topleft", bty="n", lty=c(1,1), col=c("blue","black"), + legend=c("String 1 (left scale)", "String 2 (right scale)")) \ No newline at end of file diff --git a/r/Prices and returns.r b/r/Prices and returns.r new file mode 100644 index 0000000..b698019 --- /dev/null +++ b/r/Prices and returns.r @@ -0,0 +1,99 @@ +# Goal: Prices and returns + +# I like to multiply returns by 100 so as to have "units in percent". +# In other words, I like it for 5% to be a value like 5 rather than 0.05. + +################################################################### +# I. Simulate random-walk prices, switch between prices & returns. +################################################################### +# Simulate a time-series of PRICES drawn from a random walk +# where one-period returns are i.i.d. N(mu, sigma^2). +ranrw <- function(mu, sigma, p0=100, T=100) { + cumprod(c(p0, 1 + (rnorm(n=T, mean=mu, sd=sigma)/100))) +} +prices2returns <- function(x) { + 100*diff(log(x)) +} +returns2prices <- function(r, p0=100) { + c(p0, p0 * exp(cumsum(r/100))) +} + +cat("Simulate 25 points from a random walk starting at 1500 --\n") +p <- ranrw(0.05, 1.4, p0=1500, T=25) + # gives you a 25-long series, starting with a price of 1500, where + # one-period returns are N(0.05,1.4^2) percent. +print(p) + +cat("Convert to returns--\n") +r <- prices2returns(p) +print(r) + +cat("Go back from returns to prices --\n") +goback <- returns2prices(r, 1500) +print(goback) + +################################################################### +# II. Plenty of powerful things you can do with returns.... +################################################################### +summary(r); sd(r) # summary statistics +plot(density(r)) # kernel density plot +acf(r) # Autocorrelation function +ar(r) # Estimate a AIC-minimising AR model +Box.test(r, lag=2, type="Ljung") # Box-Ljung test +library(tseries) +runs.test(factor(sign(r))) # Runs test +bds.test(r) # BDS test. + +################################################################### +# III. Visualisation and the random walk +################################################################### +# I want to obtain intuition into what kinds of price series can happen, +# given a starting price, a mean return, and a given standard deviation. +# This function simulates out 10000 days of a price time-series at a time, +# and waits for you to click in the graph window, after which a second +# series is painted, and so on. Make the graph window very big and +# sit back and admire. +# The point is to eyeball many series and thus obtain some intuition +# into what the random walk does. +visualisation <- function(p0, s, mu, labelstring) { + N <- 10000 + x <- (1:(N+1))/250 # Unit of years + while (1) { + plot(x, ranrw(mu, s, p0, N), ylab="Level", log="y", + type="l", col="red", xlab="Time (years)", + main=paste("40 years of a process much like", labelstring)) + grid() + z=locator(1) + } +} + +# Nifty -- assuming sigma of 1.4% a day and E(returns) of 13% a year +visualisation(2600, 1.4, 13/250, "Nifty") + +# The numerical values here are used to think about what the INR/USD +# exchange rate would have looked like if it started from 31.37, had +# a mean depreciation of 5% per year, and had the daily vol of a floating +# exchange rate like EUR/USD. +visualisation(31.37, 0.7, 5/365, "INR/USD (NOT!) with daily sigma=0.7") +# This is of course not like the INR/USD series in the real world - +# which is neither a random walk nor does it have a vol of 0.7% a day. + +# The numerical values here are used to think about what the USD/EUR +# exchange rate, starting with 1, having no drift, and having the observed +# daily vol of 0.7. (This is about right). +visualisation(1, 0.7, 0, "USD/EUR with no drift") + +################################################################### +# IV. A monte carlo experiment about the runs test +################################################################### +# Measure the effectiveness of the runs test when faced with an +# AR(1) process of length 100 with a coeff of 0.1 +set.seed(101) +one.ts <- function() {arima.sim(list(order = c(1,0,0), ar = 0.1), n=100)} +table(replicate(1000, runs.test(factor(sign(one.ts())))$p.value < 0.05)) +# We find that the runs test throws up a prob value of below 0.05 +# for 91 out of 1000 experiments. +# Wow! :-) +# To understand this, you need to look up the man pages of: +# set.seed, arima.sim, sign, factor, runs.test, replicate, table. +# e.g. say ?replicate \ No newline at end of file diff --git a/r/Print Fibonacci Sequence.r b/r/Print Fibonacci Sequence.r new file mode 100644 index 0000000..023077c --- /dev/null +++ b/r/Print Fibonacci Sequence.r @@ -0,0 +1,41 @@ +Print Fibonacci Sequence +# take input from the user +nterms = as.integer(readline(prompt="How many terms? ")) + +# first two terms +n1 = 0 +n2 = 1 +count = 2 + +# check if the number of terms is valid +if(nterms <= 0) { + print("Plese enter a positive integer") +} else { + if(nterms == 1) { + print("Fibonacci sequence:") + print(n1) + } else { + print("Fibonacci sequence:") + print(n1) + print(n2) + while(count < nterms) { + nth = n1 + n2 + print(nth) + # update values + n1 = n2 + n2 = nth + count = count + 1 + } + } +} +Output + +How many terms? 7 +[1] "Fibonacci sequence:" +[1] 0 +[1] 1 +[1] 1 +[1] 2 +[1] 3 +[1] 5 +[1] 8 \ No newline at end of file diff --git a/r/Program to Find GCD.r b/r/Program to Find GCD.r new file mode 100644 index 0000000..c635d93 --- /dev/null +++ b/r/Program to Find GCD.r @@ -0,0 +1,30 @@ +Program to Find GCD +# Program to find the +# H.C.F of two input number + +# define a function +hcf <- function(x, y) { + # choose the smaller number + if(x > y) { + smaller = y + } else { + smaller = x + } + for(i in 1:smaller) { + if((x %% i == 0) && (y %% i == 0)) { + hcf = i + } + } + return(hcf) +} + +# take input from the user +num1 = as.integer(readline(prompt = "Enter first number: ")) +num2 = as.integer(readline(prompt = "Enter second number: ")) + +print(paste("The H.C.F. of", num1,"and", num2,"is", hcf(num1, num2))) +Output + +Enter first number: 72 +Enter second number: 120 +[1] "The H.C.F. of 72 and 120 is 24" \ No newline at end of file diff --git a/r/Quartiles deciles tables graphs.r b/r/Quartiles deciles tables graphs.r new file mode 100644 index 0000000..e71b5b7 --- /dev/null +++ b/r/Quartiles deciles tables graphs.r @@ -0,0 +1,52 @@ +# Get the data in place -- +load(file="demo.rda") +summary(firms) + +# Look at it -- +plot(density(log(firms$mktcap))) +plot(firms$mktcap, firms$spread, type="p", cex=.2, col="blue", log="xy", + xlab="Market cap (Mln USD)", ylab="Bid/offer spread (bps)") +m=lm(log(spread) ~ log(mktcap), firms) +summary(m) + +# Making deciles -- +library(gtools) +library(gdata) + # for deciles (default=quartiles) +size.category = quantcut(firms$mktcap, q=seq(0, 1, 0.1), labels=F) +table(size.category) +means = aggregate(firms, list(size.category), mean) +print(data.frame(means$mktcap,means$spread)) + +# Make a picture combining the sample mean of spread (in each decile) +# with the weighted average sample mean of the spread (in each decile), +# where weights are proportional to size. +wtd.means = by(firms, size.category, + function(piece) (sum(piece$mktcap*piece$spread)/sum(piece$mktcap))) +lines(means$mktcap, means$spread, type="b", lwd=2, col="green", pch=19) +lines(means$mktcap, wtd.means, type="b", lwd=2, col="red", pch=19) +legend(x=0.25, y=0.5, bty="n", + col=c("blue", "green", "red"), + lty=c(0, 1, 1), lwd=c(0,2,2), + pch=c(0,19,19), + legend=c("firm", "Mean spread in size deciles", + "Size weighted mean spread in size deciles")) + +# Within group standard deviations -- +aggregate(firms, list(size.category), sd) + +# Now I do quartiles by BOTH mktcap and spread. +size.quartiles = quantcut(firms$mktcap, labels=F) +spread.quartiles = quantcut(firms$spread, labels=F) +table(size.quartiles, spread.quartiles) +# Re-express everything as joint probabilities +table(size.quartiles, spread.quartiles)/nrow(firms) +# Compute cell means at every point in the joint table: +aggregate(firms, list(size.quartiles, spread.quartiles), mean) + +# Make pretty two-way tables +aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, nobs) +aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, mean) +aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, sd) +aggregate.table(firms$spread, size.quartiles, spread.quartiles, mean) +aggregate.table(firms$spread, size.quartiles, spread.quartiles, sd) \ No newline at end of file diff --git a/r/R Hello World Program.r b/r/R Hello World Program.r new file mode 100644 index 0000000..b86911b --- /dev/null +++ b/r/R Hello World Program.r @@ -0,0 +1,11 @@ +> # We can use the print() function +> print("Hello World!") +[1] "Hello World!" + +> # Quotes can be suppressed in the output +> print("Hello World!", quote = FALSE) +[1] Hello World! + +> # If there are more than 1 item, we can concatenate using paste() +> print(paste("How","are","you?")) +[1] "How are you?" \ No newline at end of file diff --git a/r/R syntax where model specification is an argument to a function.r b/r/R syntax where model specification is an argument to a function.r new file mode 100644 index 0000000..f41705b --- /dev/null +++ b/r/R syntax where model specification is an argument to a function.r @@ -0,0 +1,12 @@ +# Goal: R syntax where model specification is an argument to a function. + +# Invent a dataset +x <- runif(100); y <- runif(100); z <- 2 + 3*x + 4*y + rnorm(100) +D <- data.frame(x=x, y=y, z=z) + +amodel <- function(modelstring) { + summary(lm(modelstring, D)) +} + +amodel(z ~ x) +amodel(z ~ y) \ No newline at end of file diff --git a/r/README.md b/r/README.md new file mode 100644 index 0000000..2077133 --- /dev/null +++ b/r/README.md @@ -0,0 +1,3 @@ +# R programming language + +R is a programming language designed for statistical computing and graphics purposes. Contains code that can be executed within the R software environment. \ No newline at end of file diff --git a/r/Reading and writing ascii files, reading and writing binary files.r b/r/Reading and writing ascii files, reading and writing binary files.r new file mode 100644 index 0000000..a27066c --- /dev/null +++ b/r/Reading and writing ascii files, reading and writing binary files.r @@ -0,0 +1,43 @@ +# Goal: Reading and writing ascii files, reading and writing binary files. +# And, to measure how much faster it is working with binary files. + +# First manufacture a tall data frame: + # FYI -- runif(10) yields 10 U(0,1) random numbers. +B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000)) +summary(B) + +# Write out ascii file: +write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA) +# Read in this resulting ascii file: +C=read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names=1) +# Write a binary file out of dataset C: +save(C, file="/tmp/foo.binary") +# Delete the dataset C: +rm(C) +# Restore from foo.binary: +load("/tmp/foo.binary") +summary(C) # should yield the same results + # as summary(B) above. + + +# Now we time all these operations -- +cat("Time creation of dataset:\n") +system.time({ + B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000)) +}) + +cat("Time writing an ascii file out of dataset B:\n") +system.time( + write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA) + ) + +cat("Time reading an ascii file into dataset C:\n") +system.time( + {C=read.table("/tmp/foo.csv", header = TRUE, sep=",", row.names=1) + }) + +cat("Time writing a binary file out of dataset C:\n") +system.time(save(C, file="/tmp/foo.binary")) + +cat("Time reading a binary file + variablenames from /tmp/foo.binary:\n") +system.time(load("/tmp/foo.binary")) # and then read it in from binary file \ No newline at end of file diff --git a/r/Reading in a Microsoft .r b/r/Reading in a Microsoft .r new file mode 100644 index 0000000..ef25af4 --- /dev/null +++ b/r/Reading in a Microsoft .r @@ -0,0 +1,15 @@ +# Goal: Reading in a Microsoft .xls file directly + +library(gdata) +a <- read.xls("file.xls", sheet=2) # This reads in the 2nd sheet + +# Look at what the cat dragged in +str(a) + +# If you have a date column, you'll want to fix it up like this: +a$date <- as.Date(as.character(a$X), format="%d-%b-%y") +a$X <- NULL + + +# Also see http://tolstoy.newcastle.edu.au/R/help/06/04/25674.html for +# another path. \ No newline at end of file diff --git a/r/Reading in a file made by CMIE's Business Beacon program.r b/r/Reading in a file made by CMIE's Business Beacon program.r new file mode 100644 index 0000000..119acbd --- /dev/null +++ b/r/Reading in a file made by CMIE's Business Beacon program.r @@ -0,0 +1,17 @@ +# Goal: To read in files produced by CMIE's "Business Beacon". +# This assumes you have made a file of MONTHLY data using CMIE's +# Business Beacon program. This contains 2 columns: M3 and M0. + +A <- read.table( + # Generic to all BB files -- + sep="|", # CMIE's .txt file is pipe delimited + skip=3, # Skip the 1st 3 lines + na.strings=c("N.A.","Err"), # The ways they encode missing data + # Specific to your immediate situation -- + file="bb_data.text", + col.names=c("junk", "date", "M3", "M0") + ) +A$junk <- NULL # Blow away this column + +# Parse the CMIE-style "Mmm yy" date string that's used on monthly data +A$date <- as.Date(paste("1", as.character(A$date)), format="%d %b %Y") \ No newline at end of file diff --git a/r/Sample From a Population.r b/r/Sample From a Population.r new file mode 100644 index 0000000..a3030b9 --- /dev/null +++ b/r/Sample From a Population.r @@ -0,0 +1,16 @@ +> # sample with replacement +> sample(x, replace = TRUE) +[1] 15 17 13 9 5 15 11 15 1 + +> # if we simply pass in a positive number n, it will sample +> # from 1:n without replacement +> sample(10) + [1] 2 4 7 9 1 3 10 5 8 6 + + + + + An example to simulate a coin toss for 10 times. + +> sample(c("H","T"),10, replace = TRUE) + [1] "H" "H" "H" "T" "H" "T" "H" "H" "H" "T" \ No newline at end of file diff --git a/r/Scare the hell out of children with the Cauchy distribution.r b/r/Scare the hell out of children with the Cauchy distribution.r new file mode 100644 index 0000000..5b00a73 --- /dev/null +++ b/r/Scare the hell out of children with the Cauchy distribution.r @@ -0,0 +1,40 @@ +# Goals: Scare the hell out of children with the Cauchy distribution. + +# A function which simulates N draws from one of two distributions, +# and returns the mean obtained thusly. +one.simulation <- function(N=100, distribution="normal") { + if (distribution == "normal") { + x <- rnorm(N) + } else { + x <- rcauchy(N) + } + mean(x) +} + +k1 <- density(replicate(1000, one.simulation(20))) +k2 <- density(replicate(1000, one.simulation(20, distribution="cauchy"))) + +xrange <- range(k1$x, k2$x) +plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="") +grid() +lines(k2$x, k2$y, col="red") +abline(v=.5) +legend(x="topleft", bty="n", + lty=c(1,1), + col=c("black", "red"), + legend=c("Mean of Normal", "Mean of Cauchy")) +# The distribution of the mean of normals collapses into a point; +# that of the cauchy does not. + +# Here's more scary stuff -- +for (i in 1:10) { + cat("Sigma of distribution of 1000 draws from mean of normal - ", + sd(replicate(1000, one.simulation(20))), "\n") +} +for (i in 1:10) { + cat("Sigma of distribution of 1000 draws from mean of cauchy - ", + sd(replicate(1000, one.simulation(20, distribution="cauchy"))), "\n") +} + +# Exercise for the reader: Compare the distribution of the median of +# the Normal against the distribution of the median of the Cauchy. \ No newline at end of file diff --git a/r/Sending an R data object to someone else, either in email or as a binary file.r b/r/Sending an R data object to someone else, either in email or as a binary file.r new file mode 100644 index 0000000..1070990 --- /dev/null +++ b/r/Sending an R data object to someone else, either in email or as a binary file.r @@ -0,0 +1,33 @@ +# Goals: Lots of times, you need to give an R object to a friend, +# or embed data into an email. + +# First I invent a little dataset -- +set.seed(101) # To make sure you get the same random numbers as me + # FYI -- runif(10) yields 10 U(0,1) random numbers. +A = data.frame(x1=runif(10), x2=runif(10), x3=runif(10)) +# Look at it -- +print(A) + +# Writing to a binary file that can be transported +save(A, file="/tmp/my_data_file.rda") # You can give this file to a friend +load("/tmp/my_data_file.rda") + +# Plan B - you want pure ascii, which can be put into an email -- +dput(A) +# This gives you a block of R code. Let me utilise that generated code +# to create a dataset named "B". +B <- structure(list(x1 = c(0.372198376338929, 0.0438248154241592, +0.709684018278494, 0.657690396532416, 0.249855723232031, 0.300054833060130, +0.584866625955328, 0.333467143354937, 0.622011963743716, 0.54582855431363 +), x2 = c(0.879795730113983, 0.706874740775675, 0.731972594512627, +0.931634427979589, 0.455120594473556, 0.590319729177281, 0.820436094887555, +0.224118480458856, 0.411666829371825, 0.0386105608195066), x3 = c(0.700711545301601, +0.956837461562827, 0.213352001970634, 0.661061500199139, 0.923318882007152, +0.795719761401415, 0.0712125543504953, 0.389407767681405, 0.406451216200367, +0.659355078125373)), .Names = c("x1", "x2", "x3"), row.names = c("1", +"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame") + +# Verify that A and B are near-identical -- +A-B +# or, +all.equal(A,B) \ No newline at end of file diff --git a/r/Show recessions using filled colour in a macro time-series plot.r b/r/Show recessions using filled colour in a macro time-series plot.r new file mode 100644 index 0000000..55f2455 --- /dev/null +++ b/r/Show recessions using filled colour in a macro time-series plot.r @@ -0,0 +1,18 @@ +# Goal: Display of a macroeconomic time-series, with a filled colour +# bar showing a recession. + +years <- 1950:2000 +timeseries <- cumsum(c(100, runif(50)*5)) +hilo <- range(timeseries) +plot(years, timeseries, type="l", lwd=3) +# A recession from 1960 to 1965 -- +polygon(x=c(1960,1960, 1965,1965), + y=c(hilo, rev(hilo)), + density=NA, col="orange", border=NA) +lines(years, timeseries, type="l", lwd=3) # paint again so line comes on top + +# alternative method -- though not as good looking -- +# library(plotrix) +# gradient.rect(1960, hilo[1], 1965, hilo[2], +# reds=c(0,1), greens=c(0,0), blues=c(0,0), +# gradient="y") \ No newline at end of file diff --git a/r/Show the efficiency of the mean when compared with the median.r b/r/Show the efficiency of the mean when compared with the median.r new file mode 100644 index 0000000..658c5c8 --- /dev/null +++ b/r/Show the efficiency of the mean when compared with the median.r @@ -0,0 +1,26 @@ +# Goal: Show the efficiency of the mean when compared with the median +# using a large simulation where both estimators are applied on +# a sample of U(0,1) uniformly distributed random numbers. + +one.simulation <- function(N=100) { # N defaults to 100 if not supplied + x <- runif(N) + return(c(mean(x), median(x))) +} + +# Simulation -- +results <- replicate(100000, one.simulation(20)) # Gives back a 2x100000 matrix + +# Two kernel densities -- +k1 <- density(results[1,]) # results[1,] is the 1st row +k2 <- density(results[2,]) + +# A pretty picture -- +xrange <- range(k1$x, k2$x) +plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="") +grid() +lines(k2$x, k2$y, col="red") +abline(v=.5) +legend(x="topleft", bty="n", + lty=c(1,1), + col=c("black", "red"), + legend=c("Mean", "Median")) \ No newline at end of file diff --git a/r/Simple Calculator in R.r b/r/Simple Calculator in R.r new file mode 100644 index 0000000..9cfd028 --- /dev/null +++ b/r/Simple Calculator in R.r @@ -0,0 +1,48 @@ +Simple Calculator in R +# Program make a simple calculator +# that can add, subtract, multiply +# and divide using functions + +add <- function(x, y) { + return(x + y) +} + +subtract <- function(x, y) { + return(x - y) +} + +multiply <- function(x, y) { + return(x * y) +} + +divide <- function(x, y) { + return(x / y) +} + +# take input from the user +print("Select operation.") +print("1.Add") +print("2.Subtract") +print("3.Multiply") +print("4.Divide") + +choice = as.integer(readline(prompt="Enter choice[1/2/3/4]: ")) + +num1 = as.integer(readline(prompt="Enter first number: ")) +num2 = as.integer(readline(prompt="Enter second number: ")) + +operator <- switch(choice,"+","-","*","/") +result <- switch(choice, add(num1, num2), subtract(num1, num2), multiply(num1, num2), divide(num1, num2)) + +print(paste(num1, operator, num2, "=", result)) +Output + +[1] "Select operation." +[1] "1.Add" +[1] "2.Subtract" +[1] "3.Multiply" +[1] "4.Divide" +Enter choice[1/2/3/4]: 4 +Enter first number: 20 +Enter second number: 4 +[1] "20 / 4 = 5" \ No newline at end of file diff --git a/r/Simulation to study size and power in a simple problem.r b/r/Simulation to study size and power in a simple problem.r new file mode 100644 index 0000000..7ab43c8 --- /dev/null +++ b/r/Simulation to study size and power in a simple problem.r @@ -0,0 +1,53 @@ +# Goal: Simulation to study size and power in a simple problem. + +set.seed(101) + +# The data generating process: a simple uniform distribution with stated mean +dgp <- function(N,mu) {runif(N)-0.5+mu} + +# Simulate one FIXED hypothesis test for H0:mu=0, given a true mu for a sample size N +one.test <- function(N, truemu) { + x <- dgp(N,truemu) + muhat <- mean(x) + s <- sd(x)/sqrt(N) + # Under the null, the distribution of the mean has standard error s + threshold <- 1.96*s + (muhat < -threshold) || (muhat > threshold) +} # Return of TRUE means reject the null + +# Do one experiment, where the fixed H0:mu=0 is run Nexperiments times with a sample size N. +# We return only one number: the fraction of the time that H0 is rejected. +experiment <- function(Nexperiments, N, truemu) { + sum(replicate(Nexperiments, one.test(N, truemu)))/Nexperiments +} + +# Measure the size of a test, i.e. rejections when H0 is true +experiment(10000, 50, 0) +# Measurement with sample size of 50, and true mu of 0. + +# Power study: I.e. Pr(rejection) when H0 is false +# (one special case in here is when the H0 is actually true) + +muvalues <- seq(-.15,.15,.01) + # When true mu < -0.15 and when true mu > 0.15, + # the Pr(rejection) veers to 1 (full power) and it's not interesting. + +# First do this with sample size of 50 +results <- NULL +for (truth in muvalues) { + results <- c(results, experiment(10000, 50, truth)) +} +par(mai=c(.8,.8,.2,.2)) +plot(muvalues, results, type="l", lwd=2, ylim=c(0,1), + xlab="True mu", ylab="Pr(Rejection of H0:mu=0)") +abline(h=0.05, lty=2) + +# Now repeat this with sample size of 100 (should yield a higher power) +results <- NULL +for (truth in muvalues) { + results <- c(results, experiment(10000, 100, truth)) +} +lines(muvalues, results, lwd=2, col="blue") +legend(x=-0.15, y=.2, lwd=c(2,1,2), lty=c(1,2,1), cex=.8, + col=c("black","black","blue"), bty="n", + legend=c("N=50", "Size, 0.05", "N=100")) \ No newline at end of file diff --git a/r/Sort a Vector.r b/r/Sort a Vector.r new file mode 100644 index 0000000..5d503ba --- /dev/null +++ b/r/Sort a Vector.r @@ -0,0 +1,25 @@ +Sort a Vector +> x +[1] 7 1 8 3 2 6 5 2 2 4 + +> # sort in ascending order +> sort(x) +[1] 1 2 2 2 3 4 5 6 7 8 + +> # sort in descending order +> sort(x, decreasing=TRUE) +[1] 8 7 6 5 4 3 2 2 2 1 + +> # vector x remains unaffected +> x +[1] 7 1 8 3 2 6 5 2 2 4 +Sometimes we would want the index of the sorted vector instead of the values. In such case we can use the function order(). + +> order(x) +[1] 2 5 8 9 4 10 7 6 1 3 + +> order(x, decreasing=TRUE) +[1] 3 1 6 7 10 4 5 8 9 2 + +> x[order(x)] # this will also sort x +[1] 1 2 2 2 3 4 5 6 7 8 \ No newline at end of file diff --git a/r/Special cases in reading files.r b/r/Special cases in reading files.r new file mode 100644 index 0000000..3e7d0e0 --- /dev/null +++ b/r/Special cases in reading files.r @@ -0,0 +1,22 @@ +# Goal: Special cases in reading files + +# Reading in a .bz2 file -- +read.table(bzfile("file.text.bz2")) # Requires you have ./file.text.bz2 + +# Reading in a .gz file -- +read.table(gzfile("file.text.gz")) # Requires you have ./file.text.bz2 + +# Reading from a pipe -- +mydata <- read.table(pipe("awk -f filter.awk input.txt")) + +# Reading from a URL -- +read.table(url("http://www.mayin.org/ajayshah/A/demo.text")) + +# This also works -- +read.table("http://www.mayin.org/ajayshah/A/demo.text") + +# Hmm, I couldn't think of how to read a .bz2 file from a URL. How about: +read.table(pipe("links -source http://www.mayin.org/ajayshah/A/demo.text.bz2 | bunzip2")) + +# Reading binary files from a URL -- +load(url("http://www.mayin.org/ajayshah/A/nifty_weekly_returns.rda")) \ No newline at end of file diff --git a/r/Standard computations with well-studied distributions.r b/r/Standard computations with well-studied distributions.r new file mode 100644 index 0000000..4202d7a --- /dev/null +++ b/r/Standard computations with well-studied distributions.r @@ -0,0 +1,30 @@ +# Goal: Standard computations with well-studied distributions. + +# The normal distribution is named "norm". With this, we have: + +# Normal density +dnorm(c(-1.96,0,1.96)) + +# Cumulative normal density +pnorm(c(-1.96,0,1.96)) + +# Inverse of this +qnorm(c(0.025,.5,.975)) +pnorm(qnorm(c(0.025,.5,.975))) + +# 1000 random numbers from the normal distribution +summary(rnorm(1000)) + + +# Here's the same ideas, for the chi-squared distribution with 10 degrees +# of freedom. +dchisq(c(0,5,10), df=10) + +# Cumulative normal density +pchisq(c(0,5,10), df=10) + +# Inverse of this +qchisq(c(0.025,.5,.975), df=10) + +# 1000 random numbers from the normal distribution +summary(rchisq(1000, df=10)) \ No newline at end of file diff --git a/r/Standard tests.r b/r/Standard tests.r new file mode 100644 index 0000000..78a80bf --- /dev/null +++ b/r/Standard tests.r @@ -0,0 +1,24 @@ +# Goal: Some of the standard tests + +# A classical setting -- +x <- runif(100, 0, 10) # 100 draws from U(0,10) +y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma is 1 +d <- lm(y ~ x) +# CLS results -- +summary(d) + +library(sandwich) +library(lmtest) +# Durbin-Watson test -- +dwtest(d, alternative="two.sided") +# Breusch-Pagan test -- +bptest(d) +# Heteroscedasticity and autocorrelation consistent (HAC) tests +coeftest(d, vcov=kernHAC) + +# Tranplant the HAC values back in -- +library(xtable) +sum.d <- summary(d) +xtable(sum.d) +sum.d$coefficients[1:2,1:4] <- coeftest(d, vcov=kernHAC)[1:2,1:4] +xtable(sum.d) \ No newline at end of file diff --git a/r/Take input from user.r b/r/Take input from user.r new file mode 100644 index 0000000..8201559 --- /dev/null +++ b/r/Take input from user.r @@ -0,0 +1,13 @@ +my.name <- readline(prompt="Enter name: ") +my.age <- readline(prompt="Enter age: ") + +# convert character into integer +my.age <- as.integer(my.age) + +print(paste("Hi,", my.name, "next year you will be", my.age+1, "years old.")) + +Output + +Enter name: Mary +Enter age: 17 +[1] "Hi, Mary next year you will be 18 years old." \ No newline at end of file diff --git a/r/The amazing R vector notation.r b/r/The amazing R vector notation.r new file mode 100644 index 0000000..282a34b --- /dev/null +++ b/r/The amazing R vector notation.r @@ -0,0 +1,116 @@ +# Goal: The amazing R vector notation. + +cat("EXAMPLE 1: sin(x) for a vector --\n") +# Suppose you have a vector x -- +x = c(0.1,0.6,1.0,1.5) + +# The bad way -- +n = length(x) +r = numeric(n) +for (i in 1:n) { + r[i] = sin(x[i]) +} +print(r) + +# The good way -- don't use loops -- +print(sin(x)) + + +cat("\n\nEXAMPLE 2: Compute the mean of every row of a matrix --\n") +# Here's another example. It isn't really about R; it's about thinking in +# matrix notation. But still. +# Let me setup a matrix -- +N=4; M=100; +r = matrix(runif(N*M), N, M) + +# So I face a NxM matrix +# [r11 r12 ... r1N] +# [r21 r22 ... r2N] +# [r32 r32 ... r3N] +# My goal: each column needs to be reduced to a mean. + +# Method 1 uses loops: +mean1 = numeric(M) +for (i in 1:M) { + mean1[i] = mean(r[,i]) +} + +# Alternatively, just say: +mean2 = rep(1/N, N) %*% r # Pretty! + +# The two answers are the same -- +all.equal(mean1,mean2[,]) +# +# As an aside, I should say that you can do this directly by using +# the rowMeans() function. But the above is more about pedagogy rather +# than showing you how to get rowmeans. + + +cat("\n\nEXAMPLE 3: Nelson-Siegel yield curve\n") +# Write this asif you're dealing with scalars -- +# Nelson Siegel function +nsz <- function(b0, b1, b2, tau, t) { + tmp = t/tau + tmp2 = exp(-tmp) + return(b0 + ((b1+b2)*(1-tmp2)/(tmp)) - (b2*tmp2)) +} + +timepoints <- c(0.01,1:5) + +# The bad way: +z <- numeric(length(timepoints)) +for (i in 1:length(timepoints)) { + z[i] <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints[i]) +} +print(z) + +# The R way -- +print(z <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints)) + + +cat("\n\nEXAMPLE 3: Making the NPV of a bond--\n") +# You know the bad way - sum over all cashflows, NPVing each. +# Now look at the R way. +C = rep(100, 6) +nsz(14.084,-3.4107,0.0015,1.8832,timepoints) # Print interest rates +C/((1.05)^timepoints) # Print cashflows discounted @ 5% +C/((1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints))^timepoints)) # Using NS instead of 5% +# NPV in two different ways -- +C %*% (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints +sum(C * (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints) +# You can drop back to a flat yield curve at 5% easily -- +sum(C * 1.05^-timepoints) + +# Make a function for NPV -- +npv <- function(C, timepoints, r) { + return(sum(C * (1 + (0.01*r))^-timepoints)) +} +npv(C, timepoints, 5) + +# Bottom line: Here's how you make the NPV of a bond with cashflows C +# at timepoints timepoints when the zero curve is a Nelson-Siegel curve -- +npv(C, timepoints, nsz(14.084,-3.4107,0.0015,1.8832,timepoints)) +# Wow! + +# --------------------------------------------------------------------------- +# Elegant vector notation is amazingly fast (in addition to being beautiful) +N <- 1e5 +x <- runif(N, -3,3) +y <- runif(N) + +method1 <- function(x,y) { + tmp <- NULL + for (i in 1:N) { + if (x[i] < 0) tmp <- c(tmp, y[i]) + } + tmp +} + +method2 <- function(x,y) { + y[x < 0] +} + +s1 <- system.time(ans1 <- method1(x,y)) +s2 <- system.time(ans2 <- method2(x,y)) +all.equal(ans1,ans2) +s1/s2 # On my machine it's 2000x faster \ No newline at end of file diff --git a/r/To do OLS by MLE.r b/r/To do OLS by MLE.r new file mode 100644 index 0000000..5662e52 --- /dev/null +++ b/r/To do OLS by MLE.r @@ -0,0 +1,66 @@ +# Goal: To do OLS by MLE. + +# OLS likelihood function +# Note: I am going to write the LF using sigma2=sigma^2 and not sigma. +ols.lf1 <- function(theta, y, X) { + beta <- theta[-1] + sigma2 <- theta[1] + if (sigma2 <= 0) return(NA) + n <- nrow(X) + e <- y - X%*%beta # t() = matrix transpose + logl <- ((-n/2)*log(2*pi)) - ((n/2)*log(sigma2)) - ((t(e)%*%e)/(2*sigma2)) + return(-logl) # since optim() does minimisation by default. +} + +# Analytical derivatives +ols.gradient <- function(theta, y, X) { + beta <- theta[-1] + sigma2 <- theta[1] + e <- y - X%*%beta + n <- nrow(X) + + g <- numeric(length(theta)) + g[1] <- (-n/(2*sigma2)) + (t(e)%*%e)/(2*sigma2*sigma2) # d logl / d sigma + g[-1] <- (t(X) %*% e)/sigma2 # d logl / d beta + + return(-g) +} + +X <- cbind(1, runif(1000)) +theta.true <- c(2,4,6) # error variance = 2, intercept = 4, slope = 6. +y <- X %*% theta.true[-1] + sqrt(theta.true[1]) * rnorm(1000) + +# Estimation by OLS -- +d <- summary(lm(y ~ X[,2])) +theta.ols <- c(sigma2 = d$sigma^2, d$coefficients[,1]) +cat("OLS theta = ", theta.ols, "\n\n") + +cat("\nGradient-free (constrained optimisation) --\n") +optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, + lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X) + +cat("\nUsing the gradient (constrained optimisation) --\n") +optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient, + lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X) + +cat("\n\nYou say you want a covariance matrix?\n") +p <- optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient, + lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), hessian=TRUE, + y=y, X=X) +inverted <- solve(p$hessian) +results <- cbind(p$par, sqrt(diag(inverted)), p$par/sqrt(diag(inverted))) +colnames(results) <- c("Coefficient", "Std. Err.", "t") +rownames(results) <- c("Sigma", "Intercept", "X") +cat("MLE results --\n") +print(results) +cat("Compare with the OLS results --\n") +d$coefficients + +# Picture of how the loglikelihood changes if you perturb the sigma +theta <- theta.ols +delta.values <- seq(-1.5, 1.5, .01) +logl.values <- as.numeric(lapply(delta.values, + function(x) {-ols.lf1(theta+c(x,0,0),y,X)})) +plot(sqrt(theta[1]+delta.values), logl.values, type="l", lwd=3, col="blue", + xlab="Sigma", ylab="Log likelihood") +grid() \ No newline at end of file diff --git a/r/To do `moving window volatility' of returns.r b/r/To do `moving window volatility' of returns.r new file mode 100644 index 0000000..755bb64 --- /dev/null +++ b/r/To do `moving window volatility' of returns.r @@ -0,0 +1,28 @@ +# Goal: To do `moving window volatility' of returns. + +library(zoo) + +# Some data to play with (Nifty on all fridays for calendar 2004) -- +p <- structure(c(1946.05, 1971.9, 1900.65, 1847.55, 1809.75, 1833.65, 1913.6, 1852.65, 1800.3, 1867.7, 1812.2, 1725.1, 1747.5, 1841.1, 1853.55, 1868.95, 1892.45, 1796.1, 1804.45, 1582.4, 1560.2, 1508.75, 1521.1, 1508.45, 1491.2, 1488.5, 1537.5, 1553.2, 1558.8, 1601.6, 1632.3, 1633.4, 1607.2, 1590.35, 1609, 1634.1, 1668.75, 1733.65, 1722.5, 1775.15, 1820.2, 1795, 1779.75, 1786.9, 1852.3, 1872.95, 1872.35, 1901.05, 1996.2, 1969, 2012.1, 2062.7, 2080.5), index = structure(c(12419, 12426, 12433, 12440, 12447, 12454, 12461, 12468, 12475, 12482, 12489, 12496, 12503, 12510, 12517, 12524, 12531, 12538, 12545, 12552, 12559, 12566, 12573, 12580, 12587, 12594, 12601, 12608, 12615, 12622, 12629, 12636, 12643, 12650, 12657, 12664, 12671, 12678, 12685, 12692, 12699, 12706, 12713, 12720, 12727, 12734, 12741, 12748, 12755, 12762, 12769, 12776, 12783), class = "Date"), frequency = 0.142857142857143, class = c("zooreg", "zoo")) + +# Shift to returns -- +r <- 100*diff(log(p)) +head(r) +summary(r) +sd(r) + +# Compute the moving window vol -- +vol <- sqrt(250) * rollapply(r, 20, sd, align = "right") + +# A pretty plot -- +plot(vol, type="l", ylim=c(0,max(vol,na.rm=TRUE)), + lwd=2, col="purple", xlab="2004", + ylab=paste("Annualised sigma, 20-week window")) +grid() +legend(x="bottomleft", col=c("purple", "darkgreen"), + lwd=c(2,2), bty="n", cex=0.8, + legend=c("Annualised 20-week vol (left scale)", "Nifty (right scale)")) +par(new=TRUE) +plot(p, type="l", lwd=2, col="darkgreen", + xaxt="n", yaxt="n", xlab="", ylab="") +axis(4) \ No newline at end of file diff --git a/r/To do nonlinear regression.r b/r/To do nonlinear regression.r new file mode 100644 index 0000000..eab64e1 --- /dev/null +++ b/r/To do nonlinear regression.r @@ -0,0 +1,58 @@ +# Goal: To do nonlinear regression, in three ways +# By just supplying the function to be fit, +# By also supplying the analytical derivatives, and +# By having him analytically differentiate the function to be fit. +# +# John Fox has a book "An R and S+ companion to applied regression" +# (abbreviated CAR). +# An appendix associated with this book, titled +# "Nonlinear regression and NLS" +# is up on the web, and I strongly recommend that you go read it. +# +# This file is essentially from there (I have made slight changes). + +# First take some data - from the CAR book -- +library(car) +data(US.pop) +attach(US.pop) +plot(year, population, type="l", col="blue") + +# So you see, we have a time-series of the US population. We want to +# fit a nonlinear model to it. + +library(stats) # Contains nonlinear regression +time <- 0:20 +pop.mod <- nls(population ~ beta1/(1 + exp(beta2 + beta3*time)), + start=list(beta1=350, beta2=4.5, beta3=-0.3), trace=TRUE) +# You just write in the formula that you want to fit, and supply +# starting values. "trace=TRUE" makes him show iterations go by. + +summary(pop.mod) +# Add in predicted values into the plot +lines(year, fitted.values(pop.mod), lwd=3, col="red") + +# Look at residuals +plot(year, residuals(pop.mod), type="b") +abline(h=0, lty=2) + +# Using analytical derivatives: +model <- function(beta1, beta2, beta3, time) { + m <- beta1/(1+exp(beta2+beta3*time)) + term <- exp(beta2 + beta3*time) + gradient <- cbind((1+term)^-1, + -beta1*(1+term)^-2 * term, + -beta1*(1+term)^-2 * term * time) + attr(m, 'gradient') <- gradient + return(m) +} + +summary(nls(population ~ model(beta1, beta2, beta3, time), + start=list(beta1=350, beta2=4.5, beta3=-0.3))) + +# Using analytical derivatives, using automatic differentiation (!!!): +model <- deriv(~ beta1/(1 + exp(beta2+beta3*time)), # rhs of model + c('beta1', 'beta2', 'beta3'), # parameter names + function(beta1, beta2, beta3, time){} # arguments for result + ) +summary(nls(population ~ model(beta1, beta2, beta3, time), + start=list(beta1=350, beta2=4.5, beta3=-0.3))) \ No newline at end of file diff --git a/r/To do sorting.r b/r/To do sorting.r new file mode 100644 index 0000000..832a69a --- /dev/null +++ b/r/To do sorting.r @@ -0,0 +1,18 @@ +# Goal: To do sorting. +# +# The approach here needs to be explained. If `i' is a vector of +# integers, then the data frame D[i,] picks up rows from D based +# on the values found in `i'. +# +# The order() function makes an integer vector which is a correct +# ordering for the purpose of sorting. + +D <- data.frame(x=c(1,2,3,1), y=c(7,19,2,2)) +D + +# Sort on x +indexes <- order(D$x) +D[indexes,] + +# Print out sorted dataset, sorted in reverse by y +D[rev(order(D$y)),] \ No newline at end of file diff --git a/r/To make a latex table with results of an OLS regression.r b/r/To make a latex table with results of an OLS regression.r new file mode 100644 index 0000000..e9949d9 --- /dev/null +++ b/r/To make a latex table with results of an OLS regression.r @@ -0,0 +1,35 @@ +# Goal: To make a latex table with results of an OLS regression. + +# Get an OLS -- +x1 = runif(100) +x2 = runif(100, 0, 2) +y = 2 + 3*x1 + 4*x2 + rnorm(100) +m = lm(y ~ x1 + x2) + +# and print it out prettily -- +library(xtable) +# Bare -- +xtable(m) +xtable(anova(m)) + +# Better -- +print.xtable(xtable(m, caption="My regression", + label="t:mymodel", + digits=c(0,3,2,2,3)), + type="latex", + file="xtable_demo_ols.tex", + table.placement = "tp", + latex.environments=c("center", "footnotesize")) + +print.xtable(xtable(anova(m), + caption="ANOVA of my regression", + label="t:anova_mymodel"), + type="latex", + file="xtable_demo_anova.tex", + table.placement = "tp", + latex.environments=c("center", "footnotesize")) + +# Read the documentation of xtable. It actually knows how to generate +# pretty latex tables for a lot more R objects than just OLS results. +# It can be a workhorse for making tabular out of matrices, and +# can also generate HTML. \ No newline at end of file diff --git a/r/To make a panel of pictures.r b/r/To make a panel of pictures.r new file mode 100644 index 0000000..59c2fb4 --- /dev/null +++ b/r/To make a panel of pictures.r @@ -0,0 +1,33 @@ +# Goal: To make a panel of pictures. + +par(mfrow=c(3,2)) # 3 rows, 2 columns. + +# Now the next 6 pictures will be placed on these 6 regions. :-) + +# Let me take some pains on the 1st +plot(density(runif(100)), lwd=2) +text(x=0, y=0.2, "100 uniforms") # Showing you how to place text at will +abline(h=0, v=0) + # All these statements effect the 1st plot. + +x=seq(0.01,1,0.01) +par(col="blue") # default colour to blue. + +# 2 -- +plot(x, sin(x), type="l") +lines(x, cos(x), type="l", col="red") + +# 3 -- +plot(x, exp(x), type="l", col="green") +lines(x, log(x), type="l", col="orange") + +# 4 -- +plot(x, tan(x), type="l", lwd=3, col="yellow") + +# 5 -- +plot(x, exp(-x), lwd=2) +lines(x, exp(x), col="green", lwd=3) + +# 6 -- +plot(x, sin(x*x), type="l") +lines(x, sin(1/x), col="pink") \ No newline at end of file diff --git a/r/To make latex tabular out of an R matrix.r b/r/To make latex tabular out of an R matrix.r new file mode 100644 index 0000000..c3de2fc --- /dev/null +++ b/r/To make latex tabular out of an R matrix.r @@ -0,0 +1,41 @@ +# Goal: To make latex tabular out of an R matrix + +# Setup a nice R object: +m <- matrix(rnorm(8), nrow=2) +rownames(m) <- c("Age", "Weight") +colnames(m) <- c("Person1", "Person2", "Person3", "Person4") +m + +# Translate it into a latex tabular: +library(xtable) +xtable(m, digits=rep(3,5)) + +# Production latex code that goes into a paper or a book -- +print(xtable(m, + caption="String", + label="t:"), + type="latex", + file="blah.gen", + table.placement="tp", + latex.environments=c("center", "footnotesize")) +# Now you do \input{blah.gen} in your latex file. + + + +# You're lazy, and want to use R to generate latex tables for you? +data <- cbind( + c(7,9,11,2), + c(2,4,19,21) + ) +colnames(data) <- c("a","b") +rownames(data) <- c("x","y","z","a") +xtable(data) + +# or you could do +data <- rbind( + c(7,2), + c(9,4), + c(11,19), + c(2,21) + ) +# and the rest goes through identically. \ No newline at end of file diff --git a/r/To read in a simple data file where date data is present.r b/r/To read in a simple data file where date data is present.r new file mode 100644 index 0000000..8227eea --- /dev/null +++ b/r/To read in a simple data file where date data is present.r @@ -0,0 +1,24 @@ +# Goal: To read in a simple data file where date data is present. + +# Suppose you have a file "x.data" which looks like this: +# 1997-07-04,3.1,4 +# 1997-07-05,7.2,19 +# 1997-07-07,1.7,2 +# 1997-07-08,1.1,13 + +A <- read.table("x.data", sep=",", + col.names=c("date", "my1", "my2")) +A$date <- as.Date(A$date, format="%Y-%m-%d") + # Say ?strptime to learn how to use "%" to specify + # other date formats. Two examples -- + # "15/12/2002" needs "%d/%m/%Y" + # "03 Jun 1997" needs "%d %b %Y" + + # Actually, if you're using the ISO 8601 date format, i.e. + # "%Y-%m-%d", that's the default setting and you don't need to + # specify the format. + +A$newcol <- A$my1 + A$my2 # Makes a new column in A +newvar <- A$my1 - A$my2 # Makes a new R object "newvar" +A$my1 <- NULL # Delete the `my1' column +summary(A) # Makes summary statistics \ No newline at end of file diff --git a/r/To read in a simple data file, and look around it's contents.r b/r/To read in a simple data file, and look around it's contents.r new file mode 100644 index 0000000..ffe7cc2 --- /dev/null +++ b/r/To read in a simple data file, and look around it's contents.r @@ -0,0 +1,26 @@ +# Goal: To read in a simple data file, and look around it's contents. + +# Suppose you have a file "x.data" which looks like this: +# 1997,3.1,4 +# 1998,7.2,19 +# 1999,1.7,2 +# 2000,1.1,13 +# To read it in -- + +A <- read.table("x.data", sep=",", + col.names=c("year", "my1", "my2")) +nrow(A) # Count the rows in A + +summary(A$year) # The column "year" in data frame A + # is accessed as A$year + +A$newcol <- A$my1 + A$my2 # Makes a new column in A +newvar <- A$my1 - A$my2 # Makes a new R object "newvar" +A$my1 <- NULL # Removes the column "my1" + +# You might find these useful, to "look around" a dataset -- +str(A) +summary(A) +library(Hmisc) # This requires that you've installed the Hmisc package +contents(A) +describe(A) \ No newline at end of file diff --git a/r/To show amazing R indexing notation, and the use of is.r b/r/To show amazing R indexing notation, and the use of is.r new file mode 100644 index 0000000..b22df8d --- /dev/null +++ b/r/To show amazing R indexing notation, and the use of is.r @@ -0,0 +1,43 @@ +# Goal: To show amazing R indexing notation, and the use of is.na() + +x <- c(2,7,9,2,NA,5) # An example vector to play with. + +# Give me elems 1 to 3 -- +x[1:3] + +# Give me all but elem 1 -- +x[-1] + +# Odd numbered elements -- +indexes <- seq(1,6,2) +x[indexes] +# or, more compactly, +x[seq(1,6,2)] + +# Access elements by specifying "on" / "off" through booleans -- +require <- c(TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) +x[require] +# Short vectors get reused! So, to get odd numbered elems -- +x[c(TRUE,FALSE)] + +# Locate missing data -- +is.na(x) + +# Replace missing data by 0 -- +x[is.na(x)] <- 0 +x + +# Similar ideas work for matrices -- +y <- matrix(c(2,7,9,2,NA,5), nrow=2) +y + +# Make a matrix containing columns 1 and 3 -- +y[,c(1,3)] + +# Let us see what is.na(y) does -- +is.na(y) +str(is.na(y)) +# So is.na(y) gives back a matrix with the identical structure as that of y. +# Hence I can say +y[is.na(y)] <- -1 +y \ No newline at end of file diff --git a/r/To write functions.r b/r/To write functions.r new file mode 100644 index 0000000..13df8cb --- /dev/null +++ b/r/To write functions.r @@ -0,0 +1,42 @@ +# Goals: To write functions +# To write functions that send back multiple objects. + +# FIRST LEARN ABOUT LISTS -- +X = list(height=5.4, weight=54) +print("Use default printing --") +print(X) +print("Accessing individual elements --") +cat("Your height is ", X$height, " and your weight is ", X$weight, "\n") + +# FUNCTIONS -- +square <- function(x) { + return(x*x) +} +cat("The square of 3 is ", square(3), "\n") + + # default value of the arg is set to 5. +cube <- function(x=5) { + return(x*x*x); +} +cat("Calling cube with 2 : ", cube(2), "\n") # will give 2^3 +cat("Calling cube : ", cube(), "\n") # will default to 5^3. + +# LEARN ABOUT FUNCTIONS THAT RETURN MULTIPLE OBJECTS -- +powers <- function(x) { + parcel = list(x2=x*x, x3=x*x*x, x4=x*x*x*x); + return(parcel); +} + +X = powers(3); +print("Showing powers of 3 --"); print(X); + +# WRITING THIS COMPACTLY (4 lines instead of 7) + +powerful <- function(x) { + return(list(x2=x*x, x3=x*x*x, x4=x*x*x*x)); +} +print("Showing powers of 3 --"); print(powerful(3)); + +# In R, the last expression in a function is, by default, what is +# returned. So you could equally just say: +powerful <- function(x) {list(x2=x*x, x3=x*x*x, x4=x*x*x*x)} \ No newline at end of file diff --git a/r/Two CDFs and a two-sample Kolmogorov-Smirnoff test.r b/r/Two CDFs and a two-sample Kolmogorov-Smirnoff test.r new file mode 100644 index 0000000..7e28c0c --- /dev/null +++ b/r/Two CDFs and a two-sample Kolmogorov-Smirnoff test.r @@ -0,0 +1,44 @@ +# Goal: Given two vectors of data, +# superpose their CDFs +# and show the results of the two-sample Kolmogorov-Smirnoff test + +# The function consumes two vectors x1 and x2. +# You have to provide a pair of labels as `legendstrings'. +# If you supply an xlab, it's used +# If you specify log - e.g. log="x" - this is passed on to plot. +# The remaining args that you specify are sent on into ks.test() +two.cdfs.plot <- function(x1, x2, legendstrings, xlab="", log="", ...) { + stopifnot(length(x1)>0, + length(x2)>0, + length(legendstrings)==2) + hilo <- range(c(x1,x2)) + + par(mai=c(.8,.8,.2,.2)) + plot(ecdf(x1), xlim=hilo, verticals=TRUE, cex=0, + xlab=xlab, log=log, ylab="Cum. distribution", main="") + grid() + plot(ecdf(x2), add=TRUE, verticals=TRUE, cex=0, lwd=3) + legend(x="bottomright", lwd=c(1,3), lty=1, bty="n", + legend=legendstrings) + + k <- ks.test(x1,x2, ...) + text(x=hilo[1], y=c(.9,.85), pos=4, cex=.8, + labels=c( + paste("KS test statistic: ", sprintf("%.3g", k$statistic)), + paste("Prob value: ", sprintf("%.3g", k$p.value)) + ) + ) + k +} + +x1 <- rnorm(100, mean=7, sd=1) +x2 <- rnorm(100, mean=9, sd=1) + +# Check error detection -- +two.cdfs.plot(x1,x2) + +# Typical use -- +two.cdfs.plot(x1, x2, c("X1","X2"), xlab="Height (metres)", log="x") + +# Send args into ks.test() -- +two.cdfs.plot(x1, x2, c("X1","X2"), alternative="less") \ No newline at end of file diff --git a/r/Using orthogonal polynomials.r b/r/Using orthogonal polynomials.r new file mode 100644 index 0000000..b645101 --- /dev/null +++ b/r/Using orthogonal polynomials.r @@ -0,0 +1,61 @@ +# Goal: Experiment with fitting nonlinear functional forms in +# OLS, using orthogonal polynomials to avoid difficulties with +# near-singular design matrices that occur with ordinary polynomials. +# Shriya Anand, Gabor Grothendieck, Ajay Shah, March 2006. + +# We will deal with noisy data from the d.g.p. y = sin(x) + e +x <- seq(0, 2*pi, length.out=50) +set.seed(101) +y <- sin(x) + 0.3*rnorm(50) +basicplot <- function(x, y, minx=0, maxx=3*pi, title="") { + plot(x, y, xlim=c(minx,maxx), ylim=c(-2,2), main=title) + lines(x, sin(x), col="blue", lty=2, lwd=2) + abline(h=0, v=0) +} +x.outsample <- seq(0, 3*pi, length.out=100) + +# Severe multicollinearity with ordinary polynomials +x2 <- x*x +x3 <- x2*x +x4 <- x3*x +cor(cbind(x, x2, x3, x4)) +# and a perfect design matrix using orthogonal polynomials +m <- poly(x, 4) +all.equal(cor(m), diag(4)) # Correlation matrix is I. + +par(mfrow=c(2,2)) +# Ordinary polynomial regression -- + p <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4)) + summary(p) + basicplot(x, y, title="Polynomial, insample") # Data + lines(x, fitted(p), col="red", lwd=3) # In-sample + basicplot(x, y, title="Polynomial, out-of-sample") + predictions.p <- predict(p, list(x = x.outsample)) # Out-of-sample + lines(x.outsample, predictions.p, type="l", col="red", lwd=3) + lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2) + # As expected, polynomial fitting gives terrible results out of sample. + +# These IDENTICAL things using orthogonal polynomials + d <- lm(y ~ poly(x, 4)) + summary(d) + basicplot(x, y, title="Orth. poly., insample") # Data + lines(x, fitted(d), col="red", lwd=3) # In-sample + basicplot(x, y, title="Orth. poly., out-of-sample") + predictions.op <- predict(d, list(x = x.outsample)) # Out-of-sample + lines(x.outsample, predictions.op, type="l", col="red", lwd=3) + lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2) + +# predict(d) is magical! See ?SafePrediction +# The story runs at two levels. First, when you do an OLS model, +# predict()ion requires applying coefficients to an appropriate +# X matrix. But one level deeper, the polynomial or orthogonal-polynomial +# needs to be utilised for computing the X matrix based on the +# supplied x.outsample data. +# If you say p <- poly(x, n) +# then you can say predict(p, new) where predict.poly() gets invoked. +# And when you say predict(lm()), the full steps are worked out for +# you automatically: predict.poly() is used to make an X matrix and +# then prediction based on the regression results is done. + +all.equal(predictions.p, predictions.op) # Both paths are identical for this + # (tame) problem. \ No newline at end of file diff --git a/r/Utilise matrix notation.r b/r/Utilise matrix notation.r new file mode 100644 index 0000000..78dd001 --- /dev/null +++ b/r/Utilise matrix notation.r @@ -0,0 +1,26 @@ +# Goal: Utilise matrix notation +# We use the problems of portfolio analysis as an example. + +# Prices of 4 firms to play with, at weekly frequency (for calendar 2004) -- +p <- structure(c(300.403, 294.604, 291.038, 283.805, 270.773, 275.506, 292.271, 292.837, 284.872, 295.037, 280.939, 259.574, 250.608, 268.84, 266.507, 263.94, 273.173, 238.609, 230.677, 192.847, 219.078, 201.846, 210.279, 193.281, 186.748, 197.314, 202.813, 204.08, 226.044, 242.442, 261.274, 269.173, 256.05, 259.75, 243, 250.3, 263.45, 279.5, 289.55, 291.95, 302.1, 284.4, 283.5, 287.8, 298.3, 307.6, 307.65, 311.9, 327.7, 318.1, 333.6, 358.9, 385.1, 53.6, 51.95, 47.65, 44.8, 44.85, 44.3, 47.1, 44.2, 41.8, 41.9, 41, 35.3, 33.35, 35.6, 34.55, 35.55, 40.05, 35, 34.85, 28.95, 31, 29.25, 29.05, 28.95, 24.95, 26.15, 28.35, 29.4, 32.55, 37.2, 39.85, 40.8, 38.2, 40.35, 37.55, 39.4, 39.8, 43.25, 44.75, 47.25, 49.6, 47.6, 46.35, 49.4, 49.5, 50.05, 50.5, 51.85, 56.35, 54.15, 58, 60.7, 62.7, 293.687, 292.746, 283.222, 286.63, 259.774, 259.257, 270.898, 250.625, 242.401, 248.1, 244.942, 239.384, 237.926, 224.886, 243.959, 270.998, 265.557, 257.508, 258.266, 257.574, 251.917, 250.583, 250.783, 246.6, 252.475, 266.625, 263.85, 249.925, 262.9, 264.975, 273.425, 275.575, 267.2, 282.25, 284.25, 290.75, 295.625, 296.25, 291.375, 302.225, 318.95, 324.825, 320.55, 328.75, 344.05, 345.925, 356.5, 368.275, 374.825, 373.525, 378.325, 378.6, 374.4, 1416.7, 1455.15, 1380.97, 1365.31, 1303.2, 1389.64, 1344.05, 1266.29, 1265.61, 1312.17, 1259.25, 1297.3, 1327.38, 1250, 1328.03, 1347.46, 1326.79, 1286.54, 1304.84, 1272.44, 1227.53, 1264.44, 1304.34, 1277.65, 1316.12, 1370.97, 1423.35, 1382.5, 1477.75, 1455.15, 1553.5, 1526.8, 1479.85, 1546.8, 1565.3, 1606.6, 1654.05, 1689.7, 1613.95, 1703.25, 1708.05, 1786.75, 1779.75, 1906.35, 1976.6, 2027.2, 2057.85, 2029.6, 2051.35, 2033.4, 2089.1, 2065.2, 2091.7), .Dim = c(53, 4), .Dimnames = list(NULL, c("TISCO", "SAIL", "Wipro", "Infosys"))) +# Shift from prices to returns -- +r <- 100*diff(log(p)) + +# Historical expected returns -- +colMeans(r) +# Historical correlation matrix -- +cor(r) +# Historical covariance matrix -- +S <- cov(r) +S + +# Historical portfolio variance for a stated portfolio of 20%,20%,30%,30% -- +w <- c(.2, .2, .3, .3) +t(w) %*% S %*% w + +# The portfolio optimisation function in tseries -- +library(tseries) +optimised <- portfolio.optim(r) # This uses the historical facts from r +optimised$pw # Weights +optimised$pm # Expected return using these weights +optimised$ps # Standard deviation of optimised port. \ No newline at end of file diff --git a/r/Vector Elements Arithmetic.r b/r/Vector Elements Arithmetic.r new file mode 100644 index 0000000..813a94c --- /dev/null +++ b/r/Vector Elements Arithmetic.r @@ -0,0 +1,17 @@ +> sum(2,7,5) +[1] 14 + +> x +[1] 2 NA 3 1 4 + +> sum(x) # if any element is NA or NaN, result is NA or NaN +[1] NA + +> sum(x, na.rm=TRUE) # this way we can ignore NA and NaN values +[1] 10 + +> mean(x, na.rm=TRUE) +[1] 2.5 + +> prod(x, na.rm=TRUE) +[1] 24 \ No newline at end of file diff --git a/r/z=f(x,y) using contour lines and colours.r b/r/z=f(x,y) using contour lines and colours.r new file mode 100644 index 0000000..ce6982b --- /dev/null +++ b/r/z=f(x,y) using contour lines and colours.r @@ -0,0 +1,41 @@ +# Goal: Visualisation of 3-dimensional (x,y,z) data using contour +# plots and using colour to represent the 3rd dimension. +# The specific situation is: On a grid of (x,y) points, you have +# evaluated f(x,y). Now you want a graphical representation of +# the resulting list of (x,y,z) points that you have. + +# Setup an interesting data matrix of (x,y,z) points: +points <- structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0.998, 0.124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0.998, 0.71, 0.068, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0.998, 0.898, 0.396, 0.058, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.998, 0.97, 0.726, 0.268, 0.056, 0.006, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.996, 0.88, 0.546, 0.208, 0.054, 0.012, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.998, 0.964, 0.776, 0.418, 0.18, 0.054, 0.014, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.998, 0.906, 0.664, 0.342, 0.166, 0.056, 0.018, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986, 0.862, 0.568, 0.29, 0.15, 0.056, 0.022, 0.008, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.954, 0.778, 0.494, 0.26, 0.148, 0.056, 0.024, 0.012, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.906, 0.712, 0.43, 0.242, 0.144, 0.058, 0.028, 0.012, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.878, 0.642, 0.38, 0.222, 0.142, 0.066, 0.034, 0.014, 0.008, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0.846, 0.586, 0.348, 0.208, 0.136, 0.068, 0.034, 0.016, 0.012, 0.006, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0.8, 0.538, 0.318, 0.204, 0.136, 0.07, 0.046, 0.024, 0.012, 0.008, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0, 0, 0.762, 0.496, 0.294, 0.2, 0.138, 0.072, 0.05, 0.024, 0.014, 0.012, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0, 0.704, 0.472, 0.286, 0.198, 0.138, 0.074, 0.054, 0.028, 0.016, 0.012, 0.008, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0.668, 0.438, 0.276, 0.196, 0.138, 0.078, 0.054, 0.032, 0.024, 0.014, 0.012, 0.008, 0.004, 0.004, 0.002, 0.002, 0, 0, 0, 0.634, 0.412, 0.27, 0.194, 0.14, 0.086, 0.056, 0.032, 0.024, 0.016, 0.012, 0.01, 0.006, 0.004, 0.004, 0.002, 0.002, 0, 0, 0.604, 0.388, 0.26, 0.19, 0.144, 0.088, 0.058, 0.048, 0.026, 0.022, 0.014, 0.012, 0.008, 0.006, 0.004, 0.004, 0.002, 0.002, 0, 0.586, 0.376, 0.256, 0.19, 0.146, 0.094, 0.062, 0.052, 0.028, 0.024, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002, 0.002, 0.566, 0.364, 0.254, 0.192, 0.148, 0.098, 0.064, 0.054, 0.032, 0.024, 0.022, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002), .Dim = c(399, 3), .Dimnames = list(NULL, c("x", "y", "z"))) + +# Understand this object -- +summary(points) + # x is a grid from 0 to 1 + # y is a grid from 20 to 200 + # z is the interesting object which will be the 3rd dimension. + +# Solution using contourplot() from package 'lattice' +library(lattice) +d3 <- data.frame(points) +contourplot(z ~ x+y, data=d3) +## or nicer +contourplot(z ~ x+y, data=d3, cuts=20, region = TRUE) +## or using logit - transformed z values: +contourplot(qlogis(z) ~ x+y, data=d3, pretty=TRUE, region = TRUE) + +# An interesting alternative is levelplot() +levelplot(z ~ x+y, pretty=TRUE, contour=TRUE, data=d3) + +# There is a contour() function in R. Even though it sounds obvious +# for the purpose, it is a bit hard to use. +# contour() wants 3 inputs: vectors of x and y values, and a matrix of +# z values, where the x values correspond to the rows of z, and the y +# values to the columns. A collection of points like `points' above +# needs to be turned into such a grid. It might sound odd, but contour() +# image() and persp() have used this kind of input for the longest time. +# +# For irregular data, there's an interp function in the akima package +# that can convert from irregular data into the grid format. +# +# The `points' object that I have above - a list of (x,y,z) points - +# fits directly into the mentality of lattice::contourplot() but not +# into the requirements of contour() \ No newline at end of file