Adding R programming language

master
Michael Reber 5 years ago
parent 599b63599b
commit 026079a47d

@ -0,0 +1,31 @@
# Goals: A first look at R objects - vectors, lists, matrices, data frames.
# To make vectors "x" "y" "year" and "names"
x <- c(2,3,7,9)
y <- c(9,7,3,2)
year <- 1990:1993
names <- c("payal", "shraddha", "kritika", "itida")
# Accessing the 1st and last elements of y --
y[1]
y[length(y)]
# To make a list "person" --
person <- list(name="payal", x=2, y=9, year=1990)
person
# Accessing things inside a list --
person$name
person$x
# To make a matrix, pasting together the columns "year" "x" and "y"
# The verb cbind() stands for "column bind"
cbind(year, x, y)
# To make a "data frame", which is a list of vectors of the same length --
D <- data.frame(names, year, x, y)
nrow(D)
# Accessing one of these vectors
D$names
# Accessing the last element of this vector
D$names[nrow(D)]
# Or equally,
D$names[length(D$names)]

@ -0,0 +1,11 @@
# Goal: A histogram with tails shown in red.
# This happened on the R mailing list on 7 May 2004.
# This is by Martin Maechler <maechler@stat.math.ethz.ch>, who was
# responding to a slightly imperfect version of this by
# "Guazzetti Stefano" <Stefano.Guazzetti@ausl.re.it>
x <- rnorm(1000)
hx <- hist(x, breaks=100, plot=FALSE)
plot(hx, col=ifelse(abs(hx$breaks) < 1.669, 4, 2))
# What is cool is that "col" is supplied a vector.

@ -0,0 +1,67 @@
# Goals: ARMA modeling - estimation, diagnostics, forecasting.
# 0. SETUP DATA
rawdata <- c(-0.21,-2.28,-2.71,2.26,-1.11,1.71,2.63,-0.45,-0.11,4.79,5.07,-2.24,6.46,3.82,4.29,-1.47,2.69,7.95,4.46,7.28,3.43,-3.19,-3.14,-1.25,-0.50,2.25,2.77,6.72,9.17,3.73,6.72,6.04,10.62,9.89,8.23,5.37,-0.10,1.40,1.60,3.40,3.80,3.60,4.90,9.60,18.20,20.60,15.20,27.00,15.42,13.31,11.22,12.77,12.43,15.83,11.44,12.32,12.10,12.02,14.41,13.54,11.36,12.97,10.00,7.20,8.74,3.92,8.73,2.19,3.85,1.48,2.28,2.98,4.21,3.85,6.52,8.16,5.36,8.58,7.00,10.57,7.12,7.95,7.05,3.84,4.93,4.30,5.44,3.77,4.71,3.18,0.00,5.25,4.27,5.14,3.53,4.54,4.70,7.40,4.80,6.20,7.29,7.30,8.38,3.83,8.07,4.88,8.17,8.25,6.46,5.96,5.88,5.03,4.99,5.87,6.78,7.43,3.61,4.29,2.97,2.35,2.49,1.56,2.65,2.49,2.85,1.89,3.05,2.27,2.91,3.94,2.34,3.14,4.11,4.12,4.53,7.11,6.17,6.25,7.03,4.13,6.15,6.73,6.99,5.86,4.19,6.38,6.68,6.58,5.75,7.51,6.22,8.22,7.45,8.00,8.29,8.05,8.91,6.83,7.33,8.52,8.62,9.80,10.63,7.70,8.91,7.50,5.88,9.82,8.44,10.92,11.67)
# Make an R time-series out of the rawdata: specify frequency & start date
gIIP <- ts(rawdata, frequency=12, start=c(1991,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Full data")
grid()
# Based on this, I decide that 4/1995 is the start of the sensible period.
gIIP <- window(gIIP, start=c(1995,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Estimation subset")
grid()
# Descriptive statistics about gIIP
mean(gIIP); sd(gIIP); summary(gIIP);
plot(density(gIIP), col="blue", main="(Unconditional) Density of IIP growth")
acf(gIIP)
# 1. ARMA ESTIMATION
m.ar2 <- arima(gIIP, order = c(2,0,0))
print(m.ar2) # Print it out
# 2. ARMA DIAGNOSTICS
tsdiag(m.ar2) # His pretty picture of diagnostics
## Time series structure in errors
print(Box.test(m.ar2$residuals, lag=12, type="Ljung-Box"));
## Sniff for ARCH
print(Box.test(m.ar2$residuals^2, lag=12, type="Ljung-Box"));
## Eyeball distribution of residuals
plot(density(m.ar2$residuals), col="blue", xlim=c(-8,8),
main=paste("Residuals of AR(2)"))
# 3. FORECASTING
## Make a picture of the residuals
plot.ts(m.ar2$residual, ylab="Innovations", col="blue", lwd=2)
s <- sqrt(m.ar2$sigma2)
abline(h=c(-s,s), lwd=2, col="lightGray")
p <- predict(m.ar2, n.ahead = 12) # Make 12 predictions.
print(p)
## Watch the forecastability decay away from large values to 0.
## sd(gIIP) is the naive sigma. p$se is the prediction se.
gain <- 100*(1-p$se/sd(gIIP))
plot.ts(gain, main="Gain in forecast s.d.", ylab="Per cent",
col="blue", lwd=2)
## Make a pretty picture that puts it all together
ts.plot(gIIP, p$pred, p$pred-1.96*p$se, p$pred+1.96*p$se,
gpars=list(lty=c(1,1,2,2), lwd=c(2,2,1,1),
ylab="IIP growth (%)", col=c("blue","red", "red", "red")))
grid()
abline(h=mean(gIIP), lty=2, lwd=2, col="lightGray")
legend(x="bottomleft", cex=0.8, bty="n",
lty=c(1,1,2,2), lwd=c(2,1,1,2),
col=c("blue", "red", "red", "lightGray"),
legend=c("IIP", "AR(2) forecasts", "95% C.I.", "Mean IIP growth"))

@ -0,0 +1,16 @@
> x
[1] 3 6 8
> y
[1] 2 9 0
> x + y
[1] 5 15 8
> x + 1 # 1 is recycled to (1,1,1)
[1] 4 7 9
> x + c(1,4) # (1,4) is recycled to (1,4,1) but warning issued
[1] 4 10 9
Warning message:
In x + c(1, 4) :
longer object length is not a multiple of shorter object length

@ -0,0 +1,33 @@
# Goal: All manner of import and export of datasets.
# Invent a dataset --
A <- data.frame(
name=c("a","b","c"),
ownership=c("Case 1","Case 1","Case 2"),
listed.at=c("NSE",NA,"BSE"),
# Firm "b" is unlisted.
is.listed=c(TRUE,FALSE,TRUE),
# R convention - boolean variables are named "is.something"
x=c(2.2,3.3,4.4),
date=as.Date(c("2004-04-04","2005-05-05","2006-06-06"))
)
# To a spreadsheet through a CSV file --
write.table(A,file="demo.csv",sep = ",",col.names = NA,qmethod = "double")
B <- read.table("demo.csv", header = TRUE, sep = ",", row.names = 1)
# To R as a binary file --
save(A, file="demo.rda")
load("demo.rda")
# To the open XML standard for the transport of statistical data --
library(StatDataML)
writeSDML(A, "/tmp/demo.sdml")
B <- readSDML("/tmp/demo.sdml")
# To Stata --
library(foreign)
write.dta(A, "/tmp/demo.dta")
B <- read.dta("/tmp/demo.dta")
# foreign::write.foreign() also has a pathway to SAS and SPSS.
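# A sketch of that pathway (my addition; the /tmp file names are illustrative):
# write.foreign() emits a text data file plus a syntax file that the target
# package runs to read the data back in.
write.foreign(A, datafile="/tmp/demo_spss.txt", codefile="/tmp/demo_spss.sps", package="SPSS")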

@ -0,0 +1,33 @@
# Goal: An example of simulation-based inference.
# This is in the context of testing for time-series dependence in
# stock market returns data.
# The code here implements the idea of Kim, Nelson and Startz (1991).
# We want to use the distribution of real-world returns data, without
# needing assumptions about normality.
# The null is lack of dependence (i.e. an efficient market).
# So repeatedly, the data is permuted, and the sample ACF is computed.
# This gives us the distribution of the ACF under H0: independence, but
# while using the empirical distribution of the returns data.
# Weekly returns on Nifty, 1/1/2002 to 31/12/2003, 104 weeks of data.
r <- c(-0.70031182197603, 0.421690133064168, -1.20098072984689, 0.143402360644984, 3.81836537549516, 3.17055939373247, 0.305580301919228, 1.23853814691852, 0.81584795095706, -1.51865139747764, -2.71223626421522, -0.784836480094242, 1.09180041170998, 0.397649587762761, -4.11309534220923, -0.263912425099111, -0.0410144239805454, 1.75756212770972, -2.3335373897992, -2.19228764624217, -3.64578978183987, 1.92535789661354, 3.45782867883164, -2.15532607229374, -0.448039988298987, 1.50124793565896, -1.45871585874362, -2.13459863369767, -6.2128068251802, -1.94482987066289, 0.751294815735637, 1.78244982829590, 1.61567494389745, 1.53557708728931, -1.53557708728931, -0.322061470004265, -2.28394919698225, 0.70399304137414, -2.93580952607737, 2.38125098034425, 0.0617697039252185, -4.14482733720716, 2.04397528093754, 0.576400673606603, 3.43072725191913, 2.96465382864843, 2.89833358015583, 1.85387040058336, 1.52136515035952, -0.637268376944444, 1.75418926224609, -0.804391905851354, -0.861816058320475, 0.576902488444109, -2.84259880663331, -1.35375536139417, 1.49096529042234, -2.05404881010045, 2.86868849528146, -0.258270670200478, -4.4515881438687, -1.73055019137092, 3.04427015714648, -2.94928202352018, 1.62081315773994, -6.83117945164824, -0.962715713711582, -1.75875847071740, 1.50330330252721, -0.0479705789653728, 3.68968303215933, -0.535807567290103, 3.94034871061182, 3.85787174417738, 0.932185956989873, 4.08598654183674, 2.27343783689715, 1.13958830440017, 2.01737201171230, -1.88131458327554, 1.97596267156648, 2.79857144562001, 2.22470306481695, 2.03212951411427, 4.95626853448883, 3.40400972901396, 3.03840139165246, -1.89863129741417, -3.70832135042951, 4.78478922155396, 4.3973589590097, 4.9667050392987, 2.99775078737081, -4.12349101552438, 3.25638269809945, 2.29683376253966, -2.64772825878214, -0.630835277076258, 4.72528848505451, 1.87368447333380, 3.17543946162564, 4.58174427843208, 3.23625985632168, 2.29777651227296)
# The 1st autocorrelation from the sample:
acf(r, 1, plot=FALSE)$acf[2]
# Obtain 1000 draws from the distribution of the 1st autocorrelation
# under the null of independence:
set.seed(101)
simulated <- replicate(1000, acf(r[sample(1:104, replace=FALSE)], 1, plot=FALSE)$acf[2])
# At 95% --
quantile(simulated, probs=c(.025,.975))
# At 99% --
quantile(simulated, probs=c(.005,.995))
# So we can reject the null at 95% but not at 99%.
# A pretty picture.
plot(density(simulated), col="blue")
abline(v=0)
abline(v=quantile(simulated, probs=c(.025,.975)), lwd=2, col="purple")
abline(v=acf(r, 1, plot=FALSE)$acf[2], lty=2, lwd=4, col="yellow")

@ -0,0 +1,52 @@
# Goal: Associative arrays (as in awk) or hashes (as in perl).
# Or, more generally, adventures in R addressing.
# Here's a plain R vector:
x <- c(2,3,7,9)
# But now I tag every elem with labels:
names(x) <- c("kal","sho","sad","aja")
# Associative array operations:
x["kal"] <- 12
# Pretty printing the entire associative array:
x
# This works for matrices too:
m <- matrix(runif(10), nrow=5)
rownames(m) <- c("violet","indigo","blue","green","yellow")
colnames(m) <- c("Asia","Africa")
# The full matrix --
m
# Or even better --
library(xtable)
xtable(m)
# Now address symbolically --
m[,"Africa"]
m["indigo",]
m["indigo","Africa"]
# The "in" operator, as in awk --
for (colour in c("yellow", "orange", "red")) {
if (colour %in% rownames(m)) {
cat("For Africa and ", colour, " we have ", m[colour, "Africa"], "\n")
} else {
cat("Colour ", colour, " does not exist in the hash.\n")
}
}
# This works for data frames also --
D <- data.frame(m)
D
# Look closely at what happened --
str(D) # The colours are the rownames(D).
# Operations --
D$Africa
D[,"Africa"]
D["yellow",]
# or
subset(D, rownames(D)=="yellow")
colnames(D) <- c("Antarctica","America")
D
D$America

@ -0,0 +1,16 @@
Decimal to Binary
# Program to convert decimal
# number into binary number
# using recursive function
convert_to_binary <- function(n) {
if(n > 1) {
convert_to_binary(as.integer(n/2))
}
cat(n %% 2)
}
Output
> convert_to_binary(52)
110100

@ -0,0 +1,25 @@
Check Armstrong number
# take input from the user
num = as.integer(readline(prompt="Enter a number: "))
# initialize sum
sum = 0
# find the sum of the cube of each digit
temp = num
while(temp > 0) {
digit = temp %% 10
sum = sum + (digit ^ 3)
temp = floor(temp / 10)
}
# display the result
if(num == sum) {
print(paste(num, "is an Armstrong number"))
} else {
print(paste(num, "is not an Armstrong number"))
}
Output 1
Enter a number: 23
[1] "23 is not an Armstrong number"

@ -0,0 +1,27 @@
Check Leap Year
# Program to check if
# the input year is
# a leap year or not
year = as.integer(readline(prompt="Enter a year: "))
if((year %% 4) == 0) {
if((year %% 100) == 0) {
if((year %% 400) == 0) {
print(paste(year,"is a leap year"))
} else {
print(paste(year,"is not a leap year"))
}
} else {
print(paste(year,"is a leap year"))
}
} else {
print(paste(year,"is not a leap year"))
}
Output 1
Enter a year: 1900
[1] "1900 is not a leap year"
Output 2
Enter a year: 2000
[1] "2000 is a leap year"

@ -0,0 +1,21 @@
Check Odd and Even Number
# Program to check if
# the input number is odd or even.
# A number is even if division
# by 2 gives a remainder of 0.
# If remainder is 1, it is odd.
num = as.integer(readline(prompt="Enter a number: "))
if((num %% 2) == 0) {
print(paste(num,"is Even"))
} else {
print(paste(num,"is Odd"))
}
Output 1
Enter a number: 89
[1] "89 is Odd"
Output 2
Enter a number: 0
[1] "0 is Even"

@ -0,0 +1,24 @@
Check Positive, Negative or Zero
# In this program, we input a number
# check if the number is positive or
# negative or zero and display
# an appropriate message
num = as.double(readline(prompt="Enter a number: "))
if(num > 0) {
print("Positive number")
} else {
if(num == 0) {
print("Zero")
} else {
print("Negative number")
}
}
Output 1
Enter a number: -9.6
[1] "Negative number"
Output 2
Enter a number: 2
[1] "Positive number"

@ -0,0 +1,27 @@
Check Prime Number
# Program to check if
# the input number is
# prime or not
# take input from the user
num = as.integer(readline(prompt="Enter a number: "))
flag = 0
# prime numbers are greater than 1
if(num > 1) {
# check for factors
flag = 1
for(i in 2:(num-1)) {
if ((num %% i) == 0) {
flag = 0
break
}
}
}
if(num == 2) flag = 1
if(flag == 1) {
print(paste(num,"is a prime number"))
} else {
print(paste(num,"is not a prime number"))
}

@ -0,0 +1,30 @@
Compute LCM in R
# Program to find the L.C.M. of two input numbers
lcm <- function(x, y) {
# choose the greater number
if(x > y) {
greater = x
} else {
greater = y
}
while(TRUE) {
if((greater %% x == 0) && (greater %% y == 0)) {
lcm = greater
break
}
greater = greater + 1
}
return(lcm)
}
# take input from the user
num1 = as.integer(readline(prompt = "Enter first number: "))
num2 = as.integer(readline(prompt = "Enter second number: "))
print(paste("The L.C.M. of", num1,"and", num2,"is", lcm(num1, num2)))
Output
Enter first number: 24
Enter second number: 25
[1] "The L.C.M. of 24 and 25 is 600"

@ -0,0 +1,15 @@
# Goals: Do bootstrap inference, as an example, for a sample median.
library(boot)
samplemedian <- function(x, d) { # d is a vector of integer indexes
return(median(x[d])) # The genius is in the x[d] notation
}
data <- rnorm(50) # Generate a dataset with 50 obs
b <- boot(data, samplemedian, R=2000) # 2000 bootstrap replications
cat("Sample median has a sigma of ", sd(b$t[,1]), "\n")
plot(b)
# Make a 99% confidence interval
boot.ci(b, conf=0.99, type="basic")

@ -0,0 +1,44 @@
# Goal: Simulate a dataset from the OLS model and
# obtain OLS estimates for it.
x <- runif(100, 0, 10) # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma = 1
# You want to just look at OLS results?
summary(lm(y ~ x))
# Suppose x and y were packed together in a data frame --
D <- data.frame(x,y)
summary(lm(y ~ x, D))
# Full and elaborate steps --
d <- lm(y ~ x)
# Learn about this object by saying ?lm and str(d)
# Compact model results --
print(d)
# Pretty graphics for regression diagnostics --
par(mfrow=c(2,2))
plot(d)
d <- summary(d)
# Detailed model results --
print(d)
# Learn about this object by saying ?summary.lm and by saying str(d)
cat("OLS gave slope of ", d$coefficients[2,1],
"and a error sigma of ", d$sigma, "\n")
## I need to drop down to a smaller dataset now --
x <- runif(10)
y <- 2 + 3*x + rnorm(10)
m <- lm(y ~ x)
# Now R supplies a wide range of generic functions which extract
# useful things out of the result of estimation of many kinds of models.
residuals(m)
fitted(m)
AIC(m)
AIC(m, k=log(10)) # SBC
vcov(m)
logLik(m)

@ -0,0 +1,55 @@
# Goal: "Dummy variables" in regression.
# Suppose you have this data:
people = data.frame(
age = c(21,62,54,49,52,38),
education = c("college", "school", "none", "school", "college", "none"),
education.code = c( 2, 1, 0, 1, 2, 0 )
)
# Here people$education is a string categorical variable and
# people$education.code is the same thing, with a numerical coding system.
people
# Note the structure of the dataset --
str(people)
# The strings supplied for `education' have been treated (correctly) as
# a factor, but education.code is being treated as an integer and not as
# a factor.
# We want to do a dummy variable regression. Normally you would have:
# 1 Chosen college as the omitted category
# 2 Made a dummy for "none" named educationnone
# 3 Made a dummy for "school" named educationschool
# 4 Ran a regression like lm(age ~ educationnone + educationschool, people)
# But this is R. Things are cool:
lm(age ~ education, people)
# ! :-)
# When you feed him an explanatory variable like education, he does all
# these steps automatically. (He chose college as the omitted category).
# If you use an integer coding, then the obvious thing goes wrong --
lm(age ~ education.code, people)
# because he's thinking that education.code is an integer explanatory
# variable. So you need to:
lm(age ~ factor(education.code), people)
# (he chose a different omitted category)
# Alternatively, fix up the dataset --
people$education.code <- factor(people$education.code)
lm(age ~ education.code, people)
#
# Bottom line:
# Once the dataset has categorical variables correctly represented as factors, i.e. as shown by
str(people)
# doing OLS in R induces automatic generation of dummy variables while leaving one out:
lm(age ~ education, people)
lm(age ~ education.code, people)
# But what if you want the X matrix?
m <- lm(age ~ education, people)
model.matrix(m)
# This is the design matrix that went into the regression m.

@ -0,0 +1,38 @@
Fibonacci Sequence in R
# Program to display the Fibonacci
# sequence up to n-th term using
# recursive functions
recurse_fibonacci <- function(n) {
if(n <= 1) {
return(n)
} else {
return(recurse_fibonacci(n-1) + recurse_fibonacci(n-2))
}
}
# take input from the user
nterms = as.integer(readline(prompt="How many terms? "))
# check if the number of terms is valid
if(nterms <= 0) {
print("Plese enter a positive integer")
} else {
print("Fibonacci sequence:")
for(i in 0:(nterms-1)) {
print(recurse_fibonacci(i))
}
}
Output
How many terms? 9
[1] "Fibonacci sequence:"
[1] 0
[1] 1
[1] 1
[1] 2
[1] 3
[1] 5
[1] 8
[1] 13
[1] 21

@ -0,0 +1,14 @@
Find Factorial of a number using recursion
recur_factorial <- function(n) {
if(n <= 1) {
return(1)
} else {
return(n * recur_factorial(n-1))
}
}
Output
> recur_factorial(5)
[1] 120

@ -0,0 +1,33 @@
Find Minimum and Maximum
> x
[1] 5 8 3 9 2 7 4 6 10
> # find the minimum
> min(x)
[1] 2
> # find the maximum
> max(x)
[1] 10
> # find the range
> range(x)
[1] 2 10
If we want to find where the minimum or maximum is located, i.e. the index instead of the actual value, we can use the which.min() and which.max() functions.
Note that these functions return the index of the first minimum or maximum in case multiple of them exist.
> x
[1] 5 8 3 9 2 7 4 6 10
> # find index of the minimum
> which.min(x)
[1] 5
> # find index of the maximum
> which.max(x)
[1] 9
> # alternate way to find the minimum
> x[which.min(x)]
[1] 2

@ -0,0 +1,29 @@
Find factors of a number
print_factors <- function(x) {
print(paste("The factors of",x,"are:"))
for(i in 1:x) {
if((x %% i) == 0) {
print(i)
}
}
}
Output
> print_factors(120)
[1] "The factors of 120 are:"
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 8
[1] 10
[1] 12
[1] 15
[1] 20
[1] 24
[1] 30
[1] 40
[1] 60
[1] 120

@ -0,0 +1,21 @@
Find sum of natural numbers without formula
# take input from the user
num = as.integer(readline(prompt = "Enter a number: "))
if(num < 0) {
print("Enter a positive number")
} else {
sum = 0
# use while loop to iterate until zero
while(num > 0) {
sum = sum + num
num = num - 1
}
print(paste("The sum is", sum))
}
Output
Enter a number: 10
[1] "The sum is 55"

@ -0,0 +1,20 @@
#Find the factorial of a number
# take input from the user
num = as.integer(readline(prompt="Enter a number: "))
factorial = 1
# check if the number is negative, positive or zero
if(num < 0) {
print("Sorry, factorial does not exist for negative numbers")
} else if(num == 0) {
print("The factorial of 0 is 1")
} else {
for(i in 1:num) {
factorial = factorial * i
}
print(paste("The factorial of", num ,"is",factorial))
}
Output
Enter a number: 8
[1] "The factorial of 8 is 40320"

@ -0,0 +1,8 @@
> runif(1) # generates 1 random number
[1] 0.3984754
> runif(3) # generates 3 random numbers
[1] 0.8090284 0.1797232 0.6803607
> runif(3, min=5, max=10) # define the range between 5 and 10
[1] 7.099781 8.355461 5.173133

@ -0,0 +1,22 @@
# Goal:
# A stock is traded on 2 exchanges.
# Price data is missing at random on both exchanges owing to non-trading.
# We want to make a single price time-series utilising information
# from both exchanges. I.e., missing data for exchange 1 will
# be replaced by information for exchange 2 (if observed).
# Let's create some example data for the problem.
e1 <- runif(15) # Prices on exchange 1
e2 <- e1 + 0.05*rnorm(15) # Prices on exchange 2.
cbind(e1, e2)
# Blow away 5 points from each at random.
e1[sample(1:15, 5)] <- NA
e2[sample(1:15, 5)] <- NA
cbind(e1, e2)
# Now how do we reconstruct a time-series that tries to utilise both?
combined <- e1 # Do use the more liquid exchange here.
missing <- is.na(combined)
combined[missing] <- e2[missing] # if it's also missing, I don't care.
cbind(e1, e2, combined)
# There you are.

@ -0,0 +1,43 @@
# Goal: Joint distributions, marginal distributions, useful tables.
# First let me invent some fake data
set.seed(102) # This yields a good illustration.
x <- sample(1:3, 15, replace=TRUE)
education <- factor(x, labels=c("None", "School", "College"))
x <- sample(1:2, 15, replace=TRUE)
gender <- factor(x, labels=c("Male", "Female"))
age <- runif(15, min=20,max=60)
D <- data.frame(age, gender, education)
rm(x,age,gender,education)
print(D)
# Table about education
table(D$education)
# Table about education and gender --
table(D$gender, D$education)
# Joint distribution of education and gender --
table(D$gender, D$education)/nrow(D)
# Add in the marginal distributions also
addmargins(table(D$gender, D$education))
addmargins(table(D$gender, D$education))/nrow(D)
# Generate a good LaTeX table out of it --
library(xtable)
xtable(addmargins(table(D$gender, D$education))/nrow(D),
digits=c(0,2,2,2,2)) # You have to do | and \hline manually.
# Study age by gender
by(D$age, D$gender, mean)
by(D$age, D$gender, sd)
by(D$age, D$gender, summary)
# Two-way table showing average age depending on education & gender
a <- matrix(by(D$age, list(D$gender, D$education), mean), nrow=2)
rownames(a) <- levels(D$gender)
colnames(a) <- levels(D$education)
print(a)
# or, of course,
print(xtable(a))

@ -0,0 +1,23 @@
# Goals: Simulate a dataset from a "fixed effects" model, and
# obtain "least squares dummy variable" (LSDV) estimates.
#
# We do this in the context of a familiar "earnings function" -
# log earnings is quadratic in experience, with parallel shifts by
# education category.
# Create an education factor with 4 levels --
education <- factor(sample(1:4,1000, replace=TRUE),
labels=c("none", "school", "college", "beyond"))
# Simulate an experience variable with a plausible range --
experience <- 30*runif(1000) # experience from 0 to 30 years
# Make the intercept vary by education category between 4 given values --
intercept <- c(0.5,1,1.5,2)[education]
# Simulate the log earnings --
log.earnings <- intercept +
2*experience - 0.05*experience*experience + rnorm(1000)
A <- data.frame(education, experience, e2=experience*experience, log.earnings)
summary(A)
# The OLS path to LSDV --
summary(lm(log.earnings ~ -1 + education + experience + e2, A))

@ -0,0 +1,31 @@
# Goal: Make a time-series object using the "zoo" package
A <- data.frame(date=c("1995-01-01", "1995-01-02", "1995-01-03", "1995-01-06"),
x=runif(4),
y=runif(4))
A$date <- as.Date(A$date) # yyyy-mm-dd is the default format
# So far there's nothing new - it's just a data frame. I have hand-
# constructed A but you could equally have obtained it using read.table().
# I want to make a zoo matrix out of the numerical columns of A
library(zoo)
B <- A
B$date <- NULL
z <- zoo(as.matrix(B), order.by=A$date)
rm(A, B)
# So now you are holding "z", a "zoo" object. You can do many cool
# things with it.
# See http://www.google.com/search?hl=en&q=zoo+quickref+achim&btnI=I%27m+Feeling+Lucky
# To drop down to a plain data matrix, say
C <- coredata(z)
rownames(C) <- as.character(time(z))
# Compare --
str(C)
str(z)
# The above is a tedious way of doing these things, designed to give you
# an insight into what is going on. If you just want to read a file
# into a zoo object, a very short path is something like:
# z <- read.zoo(filename, format="%d %b %Y")
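# A runnable sketch of that short path (my addition; the /tmp file name and
# its layout are invented for illustration) --
write.csv(data.frame(date=as.character(time(z)), coredata(z)),
"/tmp/zdemo.csv", row.names=FALSE)
z2 <- read.zoo("/tmp/zdemo.csv", header=TRUE, sep=",", format="%Y-%m-%d")
str(z2)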

@ -0,0 +1,20 @@
# Goal: Make pictures in PDF files that can be put into a paper.
xpts <- seq(-3,3,.05)
# Here is my suggested setup for a two-column picture --
pdf("demo2.pdf", width=5.6, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off() # close the device so demo2.pdf is written out
# My suggested setup for a square one-column picture --
pdf("demo1.pdf", width=2.8, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off() # close the device so demo1.pdf is written out

@ -0,0 +1,25 @@
Multiplication Table
# Program to find the multiplication
# table (from 1 to 10)
# of a number input by the user
# take input from the user
num = as.integer(readline(prompt = "Enter a number: "))
# use for loop to iterate 10 times
for(i in 1:10) {
print(paste(num,'x', i, '=', num*i))
}
Output
Enter a number: 7
[1] "7 x 1 = 7"
[1] "7 x 2 = 14"
[1] "7 x 3 = 21"
[1] "7 x 4 = 28"
[1] "7 x 5 = 35"
[1] "7 x 6 = 42"
[1] "7 x 7 = 49"
[1] "7 x 8 = 56"
[1] "7 x 9 = 63"
[1] "7 x 10 = 70"

@ -0,0 +1,13 @@
# Goal: Display two series on one plot, one with a left y axis
# and another with a right y axis.
y1 <- cumsum(rnorm(100))
y2 <- cumsum(rnorm(100, mean=0.2))
par(mai=c(.8, .8, .2, .8))
plot(1:100, y1, type="l", col="blue", xlab="X axis label", ylab="Left legend")
par(new=TRUE)
plot(1:100, y2, type="l", ann=FALSE, yaxt="n")
axis(4)
legend(x="topleft", bty="n", lty=c(1,1), col=c("blue","black"),
legend=c("String 1 (left scale)", "String 2 (right scale)"))

@ -0,0 +1,99 @@
# Goal: Prices and returns
# I like to multiply returns by 100 so as to have "units in percent".
# In other words, I like it for 5% to be a value like 5 rather than 0.05.
###################################################################
# I. Simulate random-walk prices, switch between prices & returns.
###################################################################
# Simulate a time-series of PRICES drawn from a random walk
# where one-period returns are i.i.d. N(mu, sigma^2).
ranrw <- function(mu, sigma, p0=100, T=100) {
cumprod(c(p0, 1 + (rnorm(n=T, mean=mu, sd=sigma)/100)))
}
prices2returns <- function(x) {
100*diff(log(x))
}
returns2prices <- function(r, p0=100) {
c(p0, p0 * exp(cumsum(r/100)))
}
cat("Simulate 25 points from a random walk starting at 1500 --\n")
p <- ranrw(0.05, 1.4, p0=1500, T=25)
# gives you a 26-element series (the starting price of 1500 plus 25 steps), where
# one-period returns are N(0.05,1.4^2) percent.
print(p)
cat("Convert to returns--\n")
r <- prices2returns(p)
print(r)
cat("Go back from returns to prices --\n")
goback <- returns2prices(r, 1500)
print(goback)
###################################################################
# II. Plenty of powerful things you can do with returns....
###################################################################
summary(r); sd(r) # summary statistics
plot(density(r)) # kernel density plot
acf(r) # Autocorrelation function
ar(r) # Estimate a AIC-minimising AR model
Box.test(r, lag=2, type="Ljung") # Box-Ljung test
library(tseries)
runs.test(factor(sign(r))) # Runs test
bds.test(r) # BDS test.
###################################################################
# III. Visualisation and the random walk
###################################################################
# I want to obtain intuition into what kinds of price series can happen,
# given a starting price, a mean return, and a given standard deviation.
# This function simulates out 10000 days of a price time-series at a time,
# and waits for you to click in the graph window, after which a second
# series is painted, and so on. Make the graph window very big and
# sit back and admire.
# The point is to eyeball many series and thus obtain some intuition
# into what the random walk does.
visualisation <- function(p0, s, mu, labelstring) {
N <- 10000
x <- (1:(N+1))/250 # Unit of years
while (1) {
plot(x, ranrw(mu, s, p0, N), ylab="Level", log="y",
type="l", col="red", xlab="Time (years)",
main=paste("40 years of a process much like", labelstring))
grid()
z=locator(1)
}
}
# Nifty -- assuming sigma of 1.4% a day and E(returns) of 13% a year
visualisation(2600, 1.4, 13/250, "Nifty")
# The numerical values here are used to think about what the INR/USD
# exchange rate would have looked like if it started from 31.37, had
# a mean depreciation of 5% per year, and had the daily vol of a floating
# exchange rate like EUR/USD.
visualisation(31.37, 0.7, 5/365, "INR/USD (NOT!) with daily sigma=0.7")
# This is of course not like the INR/USD series in the real world -
# which is neither a random walk nor does it have a vol of 0.7% a day.
# The numerical values here are used to think about what the USD/EUR
# exchange rate, starting with 1, having no drift, and having the observed
# daily vol of 0.7. (This is about right).
visualisation(1, 0.7, 0, "USD/EUR with no drift")
###################################################################
# IV. A monte carlo experiment about the runs test
###################################################################
# Measure the effectiveness of the runs test when faced with an
# AR(1) process of length 100 with a coeff of 0.1
set.seed(101)
one.ts <- function() {arima.sim(list(order = c(1,0,0), ar = 0.1), n=100)}
table(replicate(1000, runs.test(factor(sign(one.ts())))$p.value < 0.05))
# We find that the runs test throws up a prob value of below 0.05
# for 91 out of 1000 experiments.
# Wow! :-)
# To understand this, you need to look up the man pages of:
# set.seed, arima.sim, sign, factor, runs.test, replicate, table.
# e.g. say ?replicate

@ -0,0 +1,41 @@
Print Fibonacci Sequence
# take input from the user
nterms = as.integer(readline(prompt="How many terms? "))
# first two terms
n1 = 0
n2 = 1
count = 2
# check if the number of terms is valid
if(nterms <= 0) {
print("Plese enter a positive integer")
} else {
if(nterms == 1) {
print("Fibonacci sequence:")
print(n1)
} else {
print("Fibonacci sequence:")
print(n1)
print(n2)
while(count < nterms) {
nth = n1 + n2
print(nth)
# update values
n1 = n2
n2 = nth
count = count + 1
}
}
}
Output
How many terms? 7
[1] "Fibonacci sequence:"
[1] 0
[1] 1
[1] 1
[1] 2
[1] 3
[1] 5
[1] 8

@ -0,0 +1,30 @@
Program to Find GCD
# Program to find the
# H.C.F. of two input numbers
# define a function
hcf <- function(x, y) {
# choose the smaller number
if(x > y) {
smaller = y
} else {
smaller = x
}
for(i in 1:smaller) {
if((x %% i == 0) && (y %% i == 0)) {
hcf = i
}
}
return(hcf)
}
# take input from the user
num1 = as.integer(readline(prompt = "Enter first number: "))
num2 = as.integer(readline(prompt = "Enter second number: "))
print(paste("The H.C.F. of", num1,"and", num2,"is", hcf(num1, num2)))
Output
Enter first number: 72
Enter second number: 120
[1] "The H.C.F. of 72 and 120 is 24"

@ -0,0 +1,52 @@
# Get the data in place --
load(file="demo.rda")
summary(firms)
# Look at it --
plot(density(log(firms$mktcap)))
plot(firms$mktcap, firms$spread, type="p", cex=.2, col="blue", log="xy",
xlab="Market cap (Mln USD)", ylab="Bid/offer spread (bps)")
m=lm(log(spread) ~ log(mktcap), firms)
summary(m)
# Making deciles --
library(gtools)
library(gdata)
# for deciles (default=quartiles)
size.category = quantcut(firms$mktcap, q=seq(0, 1, 0.1), labels=F)
table(size.category)
means = aggregate(firms, list(size.category), mean)
print(data.frame(means$mktcap,means$spread))
# Make a picture combining the sample mean of spread (in each decile)
# with the weighted average sample mean of the spread (in each decile),
# where weights are proportional to size.
wtd.means = by(firms, size.category,
function(piece) (sum(piece$mktcap*piece$spread)/sum(piece$mktcap)))
lines(means$mktcap, means$spread, type="b", lwd=2, col="green", pch=19)
lines(means$mktcap, wtd.means, type="b", lwd=2, col="red", pch=19)
legend(x=0.25, y=0.5, bty="n",
col=c("blue", "green", "red"),
lty=c(0, 1, 1), lwd=c(0,2,2),
pch=c(0,19,19),
legend=c("firm", "Mean spread in size deciles",
"Size weighted mean spread in size deciles"))
# Within group standard deviations --
aggregate(firms, list(size.category), sd)
# Now I do quartiles by BOTH mktcap and spread.
size.quartiles = quantcut(firms$mktcap, labels=F)
spread.quartiles = quantcut(firms$spread, labels=F)
table(size.quartiles, spread.quartiles)
# Re-express everything as joint probabilities
table(size.quartiles, spread.quartiles)/nrow(firms)
# Compute cell means at every point in the joint table:
aggregate(firms, list(size.quartiles, spread.quartiles), mean)
# Make pretty two-way tables
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, nobs)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, sd)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, sd)

@ -0,0 +1,11 @@
> # We can use the print() function
> print("Hello World!")
[1] "Hello World!"
> # Quotes can be suppressed in the output
> print("Hello World!", quote = FALSE)
[1] Hello World!
> # If there is more than one item, we can concatenate using paste()
> print(paste("How","are","you?"))
[1] "How are you?"

@ -0,0 +1,12 @@
# Goal: R syntax where model specification is an argument to a function.
# Invent a dataset
x <- runif(100); y <- runif(100); z <- 2 + 3*x + 4*y + rnorm(100)
D <- data.frame(x=x, y=y, z=z)
amodel <- function(modelstring) {
summary(lm(modelstring, D))
}
amodel(z ~ x)
amodel(z ~ y)

@ -0,0 +1,3 @@
# R programming language
R is a programming language designed for statistical computing and graphics. This repository contains code that can be executed within the R software environment.

@ -0,0 +1,43 @@
# Goal: Reading and writing ascii files, reading and writing binary files.
# And, to measure how much faster it is to work with binary files.
# First manufacture a tall data frame:
# FYI -- runif(10) yields 10 U(0,1) random numbers.
B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
summary(B)
# Write out ascii file:
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
# Read in this resulting ascii file:
C=read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names=1)
# Write a binary file out of dataset C:
save(C, file="/tmp/foo.binary")
# Delete the dataset C:
rm(C)
# Restore from foo.binary:
load("/tmp/foo.binary")
summary(C) # should yield the same results
# as summary(B) above.
# Now we time all these operations --
cat("Time creation of dataset:\n")
system.time({
B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
})
cat("Time writing an ascii file out of dataset B:\n")
system.time(
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
)
cat("Time reading an ascii file into dataset C:\n")
system.time(
{C=read.table("/tmp/foo.csv", header = TRUE, sep=",", row.names=1)
})
cat("Time writing a binary file out of dataset C:\n")
system.time(save(C, file="/tmp/foo.binary"))
cat("Time reading a binary file + variablenames from /tmp/foo.binary:\n")
system.time(load("/tmp/foo.binary")) # and then read it in from binary file

@ -0,0 +1,15 @@
# Goal: Reading in a Microsoft .xls file directly
library(gdata)
a <- read.xls("file.xls", sheet=2) # This reads in the 2nd sheet
# Look at what the cat dragged in
str(a)
# If you have a date column, you'll want to fix it up like this:
a$date <- as.Date(as.character(a$X), format="%d-%b-%y")
a$X <- NULL
# Also see http://tolstoy.newcastle.edu.au/R/help/06/04/25674.html for
# another path.

@ -0,0 +1,17 @@
# Goal: To read in files produced by CMIE's "Business Beacon".
# This assumes you have made a file of MONTHLY data using CMIE's
# Business Beacon program. This contains 2 columns: M3 and M0.
A <- read.table(
# Generic to all BB files --
sep="|", # CMIE's .txt file is pipe delimited
skip=3, # Skip the 1st 3 lines
na.strings=c("N.A.","Err"), # The ways they encode missing data
# Specific to your immediate situation --
file="bb_data.text",
col.names=c("junk", "date", "M3", "M0")
)
A$junk <- NULL # Blow away this column
# Parse the CMIE-style "Mmm yy" date string that's used on monthly data
A$date <- as.Date(paste("1", as.character(A$date)), format="%d %b %Y")

@ -0,0 +1,16 @@
> # sample with replacement
> sample(x, replace = TRUE)
[1] 15 17 13 9 5 15 11 15 1
> # if we simply pass in a positive number n, it will sample
> # from 1:n without replacement
> sample(10)
[1] 2 4 7 9 1 3 10 5 8 6
An example that simulates tossing a coin 10 times.
> sample(c("H","T"),10, replace = TRUE)
[1] "H" "H" "H" "T" "H" "T" "H" "H" "H" "T"

@ -0,0 +1,40 @@
# Goals: Scare the hell out of children with the Cauchy distribution.
# A function which simulates N draws from one of two distributions,
# and returns the mean obtained thusly.
one.simulation <- function(N=100, distribution="normal") {
if (distribution == "normal") {
x <- rnorm(N)
} else {
x <- rcauchy(N)
}
mean(x)
}
k1 <- density(replicate(1000, one.simulation(20)))
k2 <- density(replicate(1000, one.simulation(20, distribution="cauchy")))
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=0) # both distributions are centred at 0
legend(x="topleft", bty="n",
lty=c(1,1),
col=c("black", "red"),
legend=c("Mean of Normal", "Mean of Cauchy"))
# The distribution of the mean of normals collapses into a point;
# that of the cauchy does not.
# Here's more scary stuff --
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of normal - ",
sd(replicate(1000, one.simulation(20))), "\n")
}
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of cauchy - ",
sd(replicate(1000, one.simulation(20, distribution="cauchy"))), "\n")
}
# Exercise for the reader: Compare the distribution of the median of
# the Normal against the distribution of the median of the Cauchy.
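# A sketch of that comparison (my addition, reusing the same machinery with
# median() in place of mean()) --
one.median <- function(N=100, distribution="normal") {
if (distribution == "normal") {
x <- rnorm(N)
} else {
x <- rcauchy(N)
}
median(x)
}
m1 <- density(replicate(1000, one.median(20)))
m2 <- density(replicate(1000, one.median(20, distribution="cauchy")))
plot(m1$x, m1$y, xlim=range(m1$x, m2$x), type="l", xlab="Estimated value", ylab="")
lines(m2$x, m2$y, col="red")
abline(v=0)
legend(x="topleft", bty="n", lty=c(1,1), col=c("black", "red"),
legend=c("Median of Normal", "Median of Cauchy"))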

@ -0,0 +1,33 @@
# Goals: Lots of times, you need to give an R object to a friend,
# or embed data into an email.
# First I invent a little dataset --
set.seed(101) # To make sure you get the same random numbers as me
# FYI -- runif(10) yields 10 U(0,1) random numbers.
A = data.frame(x1=runif(10), x2=runif(10), x3=runif(10))
# Look at it --
print(A)
# Writing to a binary file that can be transported
save(A, file="/tmp/my_data_file.rda") # You can give this file to a friend
load("/tmp/my_data_file.rda")
# Plan B - you want pure ascii, which can be put into an email --
dput(A)
# This gives you a block of R code. Let me utilise that generated code
# to create a dataset named "B".
B <- structure(list(x1 = c(0.372198376338929, 0.0438248154241592,
0.709684018278494, 0.657690396532416, 0.249855723232031, 0.300054833060130,
0.584866625955328, 0.333467143354937, 0.622011963743716, 0.54582855431363
), x2 = c(0.879795730113983, 0.706874740775675, 0.731972594512627,
0.931634427979589, 0.455120594473556, 0.590319729177281, 0.820436094887555,
0.224118480458856, 0.411666829371825, 0.0386105608195066), x3 = c(0.700711545301601,
0.956837461562827, 0.213352001970634, 0.661061500199139, 0.923318882007152,
0.795719761401415, 0.0712125543504953, 0.389407767681405, 0.406451216200367,
0.659355078125373)), .Names = c("x1", "x2", "x3"), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
# Verify that A and B are near-identical --
A-B
# or,
all.equal(A,B)

@ -0,0 +1,18 @@
# Goal: Display of a macroeconomic time-series, with a filled colour
# bar showing a recession.
years <- 1950:2000
timeseries <- cumsum(c(100, runif(50)*5))
hilo <- range(timeseries)
plot(years, timeseries, type="l", lwd=3)
# A recession from 1960 to 1965 --
polygon(x=c(1960,1960, 1965,1965),
y=c(hilo, rev(hilo)),
density=NA, col="orange", border=NA)
lines(years, timeseries, type="l", lwd=3) # paint again so line comes on top
# alternative method -- though not as good looking --
# library(plotrix)
# gradient.rect(1960, hilo[1], 1965, hilo[2],
# reds=c(0,1), greens=c(0,0), blues=c(0,0),
# gradient="y")

@ -0,0 +1,26 @@
# Goal: Show the efficiency of the mean when compared with the median
# using a large simulation where both estimators are applied on
# a sample of U(0,1) uniformly distributed random numbers.
one.simulation <- function(N=100) { # N defaults to 100 if not supplied
x <- runif(N)
return(c(mean(x), median(x)))
}
# Simulation --
results <- replicate(100000, one.simulation(20)) # Gives back a 2x100000 matrix
# Two kernel densities --
k1 <- density(results[1,]) # results[1,] is the 1st row
k2 <- density(results[2,])
# A pretty picture --
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",
lty=c(1,1),
col=c("black", "red"),
legend=c("Mean", "Median"))

@ -0,0 +1,48 @@
Simple Calculator in R
# Program make a simple calculator
# that can add, subtract, multiply
# and divide using functions
add <- function(x, y) {
return(x + y)
}
subtract <- function(x, y) {
return(x - y)
}
multiply <- function(x, y) {
return(x * y)
}
divide <- function(x, y) {
return(x / y)
}
# take input from the user
print("Select operation.")
print("1.Add")
print("2.Subtract")
print("3.Multiply")
print("4.Divide")
choice = as.integer(readline(prompt="Enter choice[1/2/3/4]: "))
num1 = as.integer(readline(prompt="Enter first number: "))
num2 = as.integer(readline(prompt="Enter second number: "))
operator <- switch(choice,"+","-","*","/")
result <- switch(choice, add(num1, num2), subtract(num1, num2), multiply(num1, num2), divide(num1, num2))
print(paste(num1, operator, num2, "=", result))
Output
[1] "Select operation."
[1] "1.Add"
[1] "2.Subtract"
[1] "3.Multiply"
[1] "4.Divide"
Enter choice[1/2/3/4]: 4
Enter first number: 20
Enter second number: 4
[1] "20 / 4 = 5"

@ -0,0 +1,53 @@
# Goal: Simulation to study size and power in a simple problem.
set.seed(101)
# The data generating process: a simple uniform distribution with stated mean
dgp <- function(N,mu) {runif(N)-0.5+mu}
# Simulate one FIXED hypothesis test for H0:mu=0, given a true mu for a sample size N
one.test <- function(N, truemu) {
x <- dgp(N,truemu)
muhat <- mean(x)
s <- sd(x)/sqrt(N)
# Under the null, the distribution of the mean has standard error s
threshold <- 1.96*s
(muhat < -threshold) || (muhat > threshold)
} # Return of TRUE means reject the null
# Do one experiment, where the fixed H0:mu=0 is run Nexperiments times with a sample size N.
# We return only one number: the fraction of the time that H0 is rejected.
experiment <- function(Nexperiments, N, truemu) {
sum(replicate(Nexperiments, one.test(N, truemu)))/Nexperiments
}
# Measure the size of a test, i.e. rejections when H0 is true
experiment(10000, 50, 0)
# Measurement with sample size of 50, and true mu of 0.
# Power study: I.e. Pr(rejection) when H0 is false
# (one special case in here is when the H0 is actually true)
muvalues <- seq(-.15,.15,.01)
# When true mu < -0.15 and when true mu > 0.15,
# the Pr(rejection) veers to 1 (full power) and it's not interesting.
# First do this with sample size of 50
results <- NULL
for (truth in muvalues) {
results <- c(results, experiment(10000, 50, truth))
}
par(mai=c(.8,.8,.2,.2))
plot(muvalues, results, type="l", lwd=2, ylim=c(0,1),
xlab="True mu", ylab="Pr(Rejection of H0:mu=0)")
abline(h=0.05, lty=2)
# Now repeat this with sample size of 100 (should yield a higher power)
results <- NULL
for (truth in muvalues) {
results <- c(results, experiment(10000, 100, truth))
}
lines(muvalues, results, lwd=2, col="blue")
legend(x=-0.15, y=.2, lwd=c(2,1,2), lty=c(1,2,1), cex=.8,
col=c("black","black","blue"), bty="n",
legend=c("N=50", "Size, 0.05", "N=100"))

@ -0,0 +1,25 @@
Sort a Vector
> x
[1] 7 1 8 3 2 6 5 2 2 4
> # sort in ascending order
> sort(x)
[1] 1 2 2 2 3 4 5 6 7 8
> # sort in descending order
> sort(x, decreasing=TRUE)
[1] 8 7 6 5 4 3 2 2 2 1
> # vector x remains unaffected
> x
[1] 7 1 8 3 2 6 5 2 2 4
Sometimes we want the indices of the sorted vector instead of the values. In such cases, we can use the order() function.
> order(x)
[1] 2 5 8 9 4 10 7 6 1 3
> order(x, decreasing=TRUE)
[1] 3 1 6 7 10 4 5 8 9 2
> x[order(x)] # this will also sort x
[1] 1 2 2 2 3 4 5 6 7 8

@ -0,0 +1,22 @@
# Goal: Special cases in reading files
# Reading in a .bz2 file --
read.table(bzfile("file.text.bz2")) # Requires you have ./file.text.bz2
# Reading in a .gz file --
read.table(gzfile("file.text.gz")) # Requires you have ./file.text.bz2
# Reading from a pipe --
mydata <- read.table(pipe("awk -f filter.awk input.txt"))
# Reading from a URL --
read.table(url("http://www.mayin.org/ajayshah/A/demo.text"))
# This also works --
read.table("http://www.mayin.org/ajayshah/A/demo.text")
# Hmm, I couldn't think of how to read a .bz2 file from a URL. How about:
read.table(pipe("links -source http://www.mayin.org/ajayshah/A/demo.text.bz2 | bunzip2"))
# Reading binary files from a URL --
load(url("http://www.mayin.org/ajayshah/A/nifty_weekly_returns.rda"))

@ -0,0 +1,30 @@
# Goal: Standard computations with well-studied distributions.
# The normal distribution is named "norm". With this, we have:
# Normal density
dnorm(c(-1.96,0,1.96))
# Cumulative normal density
pnorm(c(-1.96,0,1.96))
# Inverse of this
qnorm(c(0.025,.5,.975))
pnorm(qnorm(c(0.025,.5,.975)))
# 1000 random numbers from the normal distribution
summary(rnorm(1000))
# Here's the same ideas, for the chi-squared distribution with 10 degrees
# of freedom.
dchisq(c(0,5,10), df=10)
# Cumulative chi-squared density
pchisq(c(0,5,10), df=10)
# Inverse of this
qchisq(c(0.025,.5,.975), df=10)
# 1000 random numbers from this chi-squared distribution
summary(rchisq(1000, df=10))

@ -0,0 +1,24 @@
# Goal: Some of the standard tests
# A classical setting --
x <- runif(100, 0, 10) # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma is 1
d <- lm(y ~ x)
# CLS results --
summary(d)
library(sandwich)
library(lmtest)
# Durbin-Watson test --
dwtest(d, alternative="two.sided")
# Breusch-Pagan test --
bptest(d)
# Heteroscedasticity and autocorrelation consistent (HAC) tests
coeftest(d, vcov=kernHAC)
# Transplant the HAC values back in --
library(xtable)
sum.d <- summary(d)
xtable(sum.d)
sum.d$coefficients[1:2,1:4] <- coeftest(d, vcov=kernHAC)[1:2,1:4]
xtable(sum.d)

@ -0,0 +1,13 @@
my.name <- readline(prompt="Enter name: ")
my.age <- readline(prompt="Enter age: ")
# convert character into integer
my.age <- as.integer(my.age)
print(paste("Hi,", my.name, "next year you will be", my.age+1, "years old."))
Output
Enter name: Mary
Enter age: 17
[1] "Hi, Mary next year you will be 18 years old."

@ -0,0 +1,116 @@
# Goal: The amazing R vector notation.
cat("EXAMPLE 1: sin(x) for a vector --\n")
# Suppose you have a vector x --
x = c(0.1,0.6,1.0,1.5)
# The bad way --
n = length(x)
r = numeric(n)
for (i in 1:n) {
r[i] = sin(x[i])
}
print(r)
# The good way -- don't use loops --
print(sin(x))
cat("\n\nEXAMPLE 2: Compute the mean of every row of a matrix --\n")
# Here's another example. It isn't really about R; it's about thinking in
# matrix notation. But still.
# Let me setup a matrix --
N=4; M=100;
r = matrix(runif(N*M), N, M)
# So I face an NxM matrix
# [r11 r12 ... r1M]
# [r21 r22 ... r2M]
# [r31 r32 ... r3M]
# My goal: each column needs to be reduced to a mean.
# Method 1 uses loops:
mean1 = numeric(M)
for (i in 1:M) {
mean1[i] = mean(r[,i])
}
# Alternatively, just say:
mean2 = rep(1/N, N) %*% r # Pretty!
# The two answers are the same --
all.equal(mean1,mean2[,])
#
# As an aside, I should say that you can do this directly by using
# the colMeans() function. But the above is more about pedagogy rather
# than showing you how to get column means.
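# For instance (my addition, equivalent to the above) --
mean3 <- colMeans(r)
all.equal(mean1, mean3)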
cat("\n\nEXAMPLE 3: Nelson-Siegel yield curve\n")
# Write this as if you're dealing with scalars --
# Nelson Siegel function
nsz <- function(b0, b1, b2, tau, t) {
tmp = t/tau
tmp2 = exp(-tmp)
return(b0 + ((b1+b2)*(1-tmp2)/(tmp)) - (b2*tmp2))
}
timepoints <- c(0.01,1:5)
# The bad way:
z <- numeric(length(timepoints))
for (i in 1:length(timepoints)) {
z[i] <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints[i])
}
print(z)
# The R way --
print(z <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
cat("\n\nEXAMPLE 3: Making the NPV of a bond--\n")
# You know the bad way - sum over all cashflows, NPVing each.
# Now look at the R way.
C = rep(100, 6)
nsz(14.084,-3.4107,0.0015,1.8832,timepoints) # Print interest rates
C/((1.05)^timepoints) # Print cashflows discounted @ 5%
C/((1 + 0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints))^timepoints) # Using NS instead of 5%
# NPV in two different ways --
C %*% (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints
sum(C * (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints)
# You can drop back to a flat yield curve at 5% easily --
sum(C * 1.05^-timepoints)
# Make a function for NPV --
npv <- function(C, timepoints, r) {
return(sum(C * (1 + (0.01*r))^-timepoints))
}
npv(C, timepoints, 5)
# Bottom line: Here's how you make the NPV of a bond with cashflows C
# at timepoints timepoints when the zero curve is a Nelson-Siegel curve --
npv(C, timepoints, nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
# Wow!
# ---------------------------------------------------------------------------
# Elegant vector notation is amazingly fast (in addition to being beautiful)
N <- 1e5
x <- runif(N, -3,3)
y <- runif(N)
method1 <- function(x,y) {
tmp <- NULL
for (i in 1:N) {
if (x[i] < 0) tmp <- c(tmp, y[i])
}
tmp
}
method2 <- function(x,y) {
y[x < 0]
}
s1 <- system.time(ans1 <- method1(x,y))
s2 <- system.time(ans2 <- method2(x,y))
all.equal(ans1,ans2)
s1/s2 # On my machine it's 2000x faster

@ -0,0 +1,66 @@
# Goal: To do OLS by MLE.
# OLS likelihood function
# Note: I am going to write the LF using sigma2=sigma^2 and not sigma.
ols.lf1 <- function(theta, y, X) {
beta <- theta[-1]
sigma2 <- theta[1]
if (sigma2 <= 0) return(NA)
n <- nrow(X)
e <- y - X%*%beta # t() = matrix transpose
logl <- ((-n/2)*log(2*pi)) - ((n/2)*log(sigma2)) - ((t(e)%*%e)/(2*sigma2))
return(-logl) # since optim() does minimisation by default.
}
# Analytical derivatives
ols.gradient <- function(theta, y, X) {
beta <- theta[-1]
sigma2 <- theta[1]
e <- y - X%*%beta
n <- nrow(X)
g <- numeric(length(theta))
g[1] <- (-n/(2*sigma2)) + (t(e)%*%e)/(2*sigma2*sigma2) # d logl / d sigma2
g[-1] <- (t(X) %*% e)/sigma2 # d logl / d beta
return(-g)
}
X <- cbind(1, runif(1000))
theta.true <- c(2,4,6) # error variance = 2, intercept = 4, slope = 6.
y <- X %*% theta.true[-1] + sqrt(theta.true[1]) * rnorm(1000)
# Estimation by OLS --
d <- summary(lm(y ~ X[,2]))
theta.ols <- c(sigma2 = d$sigma^2, d$coefficients[,1])
cat("OLS theta = ", theta.ols, "\n\n")
cat("\nGradient-free (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)
cat("\nUsing the gradient (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)
cat("\n\nYou say you want a covariance matrix?\n")
p <- optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), hessian=TRUE,
y=y, X=X)
inverted <- solve(p$hessian)
results <- cbind(p$par, sqrt(diag(inverted)), p$par/sqrt(diag(inverted)))
colnames(results) <- c("Coefficient", "Std. Err.", "t")
rownames(results) <- c("Sigma", "Intercept", "X")
cat("MLE results --\n")
print(results)
cat("Compare with the OLS results --\n")
d$coefficients
# Picture of how the loglikelihood changes if you perturb the sigma
theta <- theta.ols
delta.values <- seq(-1.5, 1.5, .01)
logl.values <- as.numeric(lapply(delta.values,
function(x) {-ols.lf1(theta+c(x,0,0),y,X)}))
plot(sqrt(theta[1]+delta.values), logl.values, type="l", lwd=3, col="blue",
xlab="Sigma", ylab="Log likelihood")
grid()

@ -0,0 +1,28 @@
# Goal: To do `moving window volatility' of returns.
library(zoo)
# Some data to play with (Nifty on all fridays for calendar 2004) --
p <- structure(c(1946.05, 1971.9, 1900.65, 1847.55, 1809.75, 1833.65, 1913.6, 1852.65, 1800.3, 1867.7, 1812.2, 1725.1, 1747.5, 1841.1, 1853.55, 1868.95, 1892.45, 1796.1, 1804.45, 1582.4, 1560.2, 1508.75, 1521.1, 1508.45, 1491.2, 1488.5, 1537.5, 1553.2, 1558.8, 1601.6, 1632.3, 1633.4, 1607.2, 1590.35, 1609, 1634.1, 1668.75, 1733.65, 1722.5, 1775.15, 1820.2, 1795, 1779.75, 1786.9, 1852.3, 1872.95, 1872.35, 1901.05, 1996.2, 1969, 2012.1, 2062.7, 2080.5), index = structure(c(12419, 12426, 12433, 12440, 12447, 12454, 12461, 12468, 12475, 12482, 12489, 12496, 12503, 12510, 12517, 12524, 12531, 12538, 12545, 12552, 12559, 12566, 12573, 12580, 12587, 12594, 12601, 12608, 12615, 12622, 12629, 12636, 12643, 12650, 12657, 12664, 12671, 12678, 12685, 12692, 12699, 12706, 12713, 12720, 12727, 12734, 12741, 12748, 12755, 12762, 12769, 12776, 12783), class = "Date"), frequency = 0.142857142857143, class = c("zooreg", "zoo"))
# Shift to returns --
r <- 100*diff(log(p))
head(r)
summary(r)
sd(r)
# Compute the moving window vol --
vol <- sqrt(52) * rollapply(r, 20, sd, align = "right") # sqrt(52) annualises weekly returns
# A pretty plot --
plot(vol, type="l", ylim=c(0,max(vol,na.rm=TRUE)),
lwd=2, col="purple", xlab="2004",
ylab=paste("Annualised sigma, 20-week window"))
grid()
legend(x="bottomleft", col=c("purple", "darkgreen"),
lwd=c(2,2), bty="n", cex=0.8,
legend=c("Annualised 20-week vol (left scale)", "Nifty (right scale)"))
par(new=TRUE)
plot(p, type="l", lwd=2, col="darkgreen",
xaxt="n", yaxt="n", xlab="", ylab="")
axis(4)

@ -0,0 +1,58 @@
# Goal: To do nonlinear regression, in three ways
# By just supplying the function to be fit,
# By also supplying the analytical derivatives, and
# By having him analytically differentiate the function to be fit.
#
# John Fox has a book "An R and S+ companion to applied regression"
# (abbreviated CAR).
# An appendix associated with this book, titled
# "Nonlinear regression and NLS"
# is up on the web, and I strongly recommend that you go read it.
#
# This file is essentially from there (I have made slight changes).
# First take some data - from the CAR book --
library(car)
data(US.pop)
attach(US.pop)
plot(year, population, type="l", col="blue")
# So you see, we have a time-series of the US population. We want to
# fit a nonlinear model to it.
library(stats) # Contains nonlinear regression
time <- 0:20
pop.mod <- nls(population ~ beta1/(1 + exp(beta2 + beta3*time)),
start=list(beta1=350, beta2=4.5, beta3=-0.3), trace=TRUE)
# You just write in the formula that you want to fit, and supply
# starting values. "trace=TRUE" makes him show iterations go by.
summary(pop.mod)
# Add in predicted values into the plot
lines(year, fitted.values(pop.mod), lwd=3, col="red")
# Look at residuals
plot(year, residuals(pop.mod), type="b")
abline(h=0, lty=2)
# Using analytical derivatives:
model <- function(beta1, beta2, beta3, time) {
m <- beta1/(1+exp(beta2+beta3*time))
term <- exp(beta2 + beta3*time)
gradient <- cbind((1+term)^-1,
-beta1*(1+term)^-2 * term,
-beta1*(1+term)^-2 * term * time)
attr(m, 'gradient') <- gradient
return(m)
}
summary(nls(population ~ model(beta1, beta2, beta3, time),
start=list(beta1=350, beta2=4.5, beta3=-0.3)))
# Using analytical derivatives, using automatic differentiation (!!!):
model <- deriv(~ beta1/(1 + exp(beta2+beta3*time)), # rhs of model
c('beta1', 'beta2', 'beta3'), # parameter names
function(beta1, beta2, beta3, time){} # arguments for result
)
summary(nls(population ~ model(beta1, beta2, beta3, time),
start=list(beta1=350, beta2=4.5, beta3=-0.3)))

@ -0,0 +1,18 @@
# Goal: To do sorting.
#
# The approach here needs to be explained. If `i' is a vector of
# integers, then the data frame D[i,] picks up rows from D based
# on the values found in `i'.
#
# The order() function makes an integer vector which is a correct
# ordering for the purpose of sorting.
D <- data.frame(x=c(1,2,3,1), y=c(7,19,2,2))
D
# Sort on x
indexes <- order(D$x)
D[indexes,]
# Print out sorted dataset, sorted in reverse by y
D[rev(order(D$y)),]

@ -0,0 +1,35 @@
# Goal: To make a latex table with results of an OLS regression.
# Get an OLS --
x1 = runif(100)
x2 = runif(100, 0, 2)
y = 2 + 3*x1 + 4*x2 + rnorm(100)
m = lm(y ~ x1 + x2)
# and print it out prettily --
library(xtable)
# Bare --
xtable(m)
xtable(anova(m))
# Better --
print.xtable(xtable(m, caption="My regression",
label="t:mymodel",
digits=c(0,3,2,2,3)),
type="latex",
file="xtable_demo_ols.tex",
table.placement = "tp",
latex.environments=c("center", "footnotesize"))
print.xtable(xtable(anova(m),
caption="ANOVA of my regression",
label="t:anova_mymodel"),
type="latex",
file="xtable_demo_anova.tex",
table.placement = "tp",
latex.environments=c("center", "footnotesize"))
# Read the documentation of xtable. It actually knows how to generate
# pretty latex tables for a lot more R objects than just OLS results.
# It can be a workhorse for making tabular out of matrices, and
# can also generate HTML.
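# For instance (my addition; the output file name is illustrative), the same
# table can be written out as HTML instead of latex --
print(xtable(m, caption="My regression"), type="html", file="xtable_demo_ols.html")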

@ -0,0 +1,33 @@
# Goal: To make a panel of pictures.
par(mfrow=c(3,2)) # 3 rows, 2 columns.
# Now the next 6 pictures will be placed on these 6 regions. :-)
# Let me take some pains on the 1st
plot(density(runif(100)), lwd=2)
text(x=0, y=0.2, "100 uniforms") # Showing you how to place text at will
abline(h=0, v=0)
# All these statements affect the 1st plot.
x=seq(0.01,1,0.01)
par(col="blue") # default colour to blue.
# 2 --
plot(x, sin(x), type="l")
lines(x, cos(x), type="l", col="red")
# 3 --
plot(x, exp(x), type="l", col="green")
lines(x, log(x), type="l", col="orange")
# 4 --
plot(x, tan(x), type="l", lwd=3, col="yellow")
# 5 --
plot(x, exp(-x), lwd=2)
lines(x, exp(x), col="green", lwd=3)
# 6 --
plot(x, sin(x*x), type="l")
lines(x, sin(1/x), col="pink")
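# A small follow-up (not in the original file): mfrow and the blue
# default colour persist for later plots on this device, so reset them
# when the panel is done --
par(mfrow=c(1,1), col="black")
# A more general habit is oldpar <- par(no.readonly=TRUE) before the
# changes, and par(oldpar) afterwards.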

@ -0,0 +1,41 @@
# Goal: To make latex tabular out of an R matrix
# Setup a nice R object:
m <- matrix(rnorm(8), nrow=2)
rownames(m) <- c("Age", "Weight")
colnames(m) <- c("Person1", "Person2", "Person3", "Person4")
m
# Translate it into a latex tabular:
library(xtable)
xtable(m, digits=rep(3,5))
# Production latex code that goes into a paper or a book --
print(xtable(m,
caption="String",
label="t:"),
type="latex",
file="blah.gen",
table.placement="tp",
latex.environments=c("center", "footnotesize"))
# Now you do \input{blah.gen} in your latex file.
# You're lazy, and want to use R to generate latex tables for you?
data <- cbind(
c(7,9,11,2),
c(2,4,19,21)
)
colnames(data) <- c("a","b")
rownames(data) <- c("x","y","z","a")
xtable(data)
# or you could do
data <- rbind(
c(7,2),
c(9,4),
c(11,19),
c(2,21)
)
# and the rest goes through identically.

@ -0,0 +1,24 @@
# Goal: To read in a simple data file where date data is present.
# Suppose you have a file "x.data" which looks like this:
# 1997-07-04,3.1,4
# 1997-07-05,7.2,19
# 1997-07-07,1.7,2
# 1997-07-08,1.1,13
A <- read.table("x.data", sep=",",
col.names=c("date", "my1", "my2"))
A$date <- as.Date(A$date, format="%Y-%m-%d")
# Say ?strptime to learn how to use "%" to specify
# other date formats. Two examples --
# "15/12/2002" needs "%d/%m/%Y"
# "03 Jun 1997" needs "%d %b %Y"
# Actually, if you're using the ISO 8601 date format, i.e.
# "%Y-%m-%d", that's the default setting and you don't need to
# specify the format.
A$newcol <- A$my1 + A$my2 # Makes a new column in A
newvar <- A$my1 - A$my2 # Makes a new R object "newvar"
A$my1 <- NULL # Delete the `my1' column
summary(A) # Makes summary statistics
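# A hedged extra sketch (not in the original file): the same %-codes
# work directly in as.Date() on character strings --
as.Date("15/12/2002", format="%d/%m/%Y")
as.Date("03 Jun 1997", format="%d %b %Y")   # %b assumes an English locale
# Date objects support arithmetic and formatting:
A$date[2] - A$date[1]                       # difference in days
format(A$date[1], "%d %b %Y")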

@ -0,0 +1,26 @@
# Goal: To read in a simple data file, and look around its contents.
# Suppose you have a file "x.data" which looks like this:
# 1997,3.1,4
# 1998,7.2,19
# 1999,1.7,2
# 2000,1.1,13
# To read it in --
A <- read.table("x.data", sep=",",
col.names=c("year", "my1", "my2"))
nrow(A) # Count the rows in A
summary(A$year) # The column "year" in data frame A
# is accessed as A$year
A$newcol <- A$my1 + A$my2 # Makes a new column in A
newvar <- A$my1 - A$my2 # Makes a new R object "newvar"
A$my1 <- NULL # Removes the column "my1"
# You might find these useful, to "look around" a dataset --
str(A)
summary(A)
library(Hmisc) # This requires that you've installed the Hmisc package
contents(A)
describe(A)
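# A hedged companion sketch (not in the original file): two more ways to
# "look around", and writing the modified data frame back out. The
# filename "x_modified.data" is just an example.
head(A)                                   # first few rows
dim(A)                                    # rows and columns
write.csv(A, file="x_modified.data", row.names=FALSE)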

@ -0,0 +1,43 @@
# Goal: To show amazing R indexing notation, and the use of is.na()
x <- c(2,7,9,2,NA,5) # An example vector to play with.
# Give me elems 1 to 3 --
x[1:3]
# Give me all but elem 1 --
x[-1]
# Odd numbered elements --
indexes <- seq(1,6,2)
x[indexes]
# or, more compactly,
x[seq(1,6,2)]
# Access elements by specifying "on" / "off" through booleans --
require <- c(TRUE,TRUE,FALSE,FALSE,FALSE,FALSE)
x[require]
# Short vectors get reused! So, to get odd numbered elems --
x[c(TRUE,FALSE)]
# Locate missing data --
is.na(x)
# Replace missing data by 0 --
x[is.na(x)] <- 0
x
# Similar ideas work for matrices --
y <- matrix(c(2,7,9,2,NA,5), nrow=2)
y
# Make a matrix containing columns 1 and 3 --
y[,c(1,3)]
# Let us see what is.na(y) does --
is.na(y)
str(is.na(y))
# So is.na(y) gives back a matrix with the same structure as y.
# Hence I can say
y[is.na(y)] <- -1
y
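# An extra sketch (not in the original file): the boolean trick extends
# to conditions computed from the data itself --
x[x > 3]            # elements of x greater than 3
which(x > 3)        # their positions
y[y[,1] > 3, ]      # rows of y whose first column exceeds 3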

@ -0,0 +1,42 @@
# Goals: To write functions
# To write functions that send back multiple objects.
# FIRST LEARN ABOUT LISTS --
X = list(height=5.4, weight=54)
print("Use default printing --")
print(X)
print("Accessing individual elements --")
cat("Your height is ", X$height, " and your weight is ", X$weight, "\n")
# FUNCTIONS --
square <- function(x) {
return(x*x)
}
cat("The square of 3 is ", square(3), "\n")
# default value of the arg is set to 5.
cube <- function(x=5) {
return(x*x*x);
}
cat("Calling cube with 2 : ", cube(2), "\n") # will give 2^3
cat("Calling cube : ", cube(), "\n") # will default to 5^3.
# LEARN ABOUT FUNCTIONS THAT RETURN MULTIPLE OBJECTS --
powers <- function(x) {
parcel = list(x2=x*x, x3=x*x*x, x4=x*x*x*x);
return(parcel);
}
X = powers(3);
print("Showing powers of 3 --"); print(X);
# WRITING THIS COMPACTLY (4 lines instead of 7)
powerful <- function(x) {
return(list(x2=x*x, x3=x*x*x, x4=x*x*x*x));
}
print("Showing powers of 3 --"); print(powerful(3));
# In R, the last expression in a function is, by default, what is
# returned. So you could equally just say:
powerful <- function(x) {list(x2=x*x, x3=x*x*x, x4=x*x*x*x)}
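# A small usage sketch (not in the original file): the components of the
# returned list are picked out with $, like any other list --
res <- powerful(3)
cat("3 squared is", res$x2, "and 3 cubed is", res$x3, "\n")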

@ -0,0 +1,44 @@
# Goal: Given two vectors of data,
# superpose their CDFs
#       and show the results of the two-sample Kolmogorov-Smirnov test
# The function consumes two vectors x1 and x2.
# You have to provide a pair of labels as `legendstrings'.
# If you supply an xlab, it's used
# If you specify log - e.g. log="x" - this is passed on to plot.
# The remaining args that you specify are sent on into ks.test()
two.cdfs.plot <- function(x1, x2, legendstrings, xlab="", log="", ...) {
stopifnot(length(x1)>0,
length(x2)>0,
length(legendstrings)==2)
hilo <- range(c(x1,x2))
par(mai=c(.8,.8,.2,.2))
plot(ecdf(x1), xlim=hilo, verticals=TRUE, cex=0,
xlab=xlab, log=log, ylab="Cum. distribution", main="")
grid()
plot(ecdf(x2), add=TRUE, verticals=TRUE, cex=0, lwd=3)
legend(x="bottomright", lwd=c(1,3), lty=1, bty="n",
legend=legendstrings)
k <- ks.test(x1,x2, ...)
text(x=hilo[1], y=c(.9,.85), pos=4, cex=.8,
labels=c(
paste("KS test statistic: ", sprintf("%.3g", k$statistic)),
paste("Prob value: ", sprintf("%.3g", k$p.value))
)
)
k
}
x1 <- rnorm(100, mean=7, sd=1)
x2 <- rnorm(100, mean=9, sd=1)
# Check error detection --
two.cdfs.plot(x1,x2)
# Typical use --
two.cdfs.plot(x1, x2, c("X1","X2"), xlab="Height (metres)", log="x")
# Send args into ks.test() --
two.cdfs.plot(x1, x2, c("X1","X2"), alternative="less")

@ -0,0 +1,61 @@
# Goal: Experiment with fitting nonlinear functional forms in
# OLS, using orthogonal polynomials to avoid difficulties with
# near-singular design matrices that occur with ordinary polynomials.
# Shriya Anand, Gabor Grothendieck, Ajay Shah, March 2006.
# We will deal with noisy data from the d.g.p. y = sin(x) + e
x <- seq(0, 2*pi, length.out=50)
set.seed(101)
y <- sin(x) + 0.3*rnorm(50)
basicplot <- function(x, y, minx=0, maxx=3*pi, title="") {
plot(x, y, xlim=c(minx,maxx), ylim=c(-2,2), main=title)
lines(x, sin(x), col="blue", lty=2, lwd=2)
abline(h=0, v=0)
}
x.outsample <- seq(0, 3*pi, length.out=100)
# Severe multicollinearity with ordinary polynomials
x2 <- x*x
x3 <- x2*x
x4 <- x3*x
cor(cbind(x, x2, x3, x4))
# and a perfect design matrix using orthogonal polynomials
m <- poly(x, 4)
all.equal(cor(m), diag(4), check.attributes=FALSE) # Correlation matrix is I.
par(mfrow=c(2,2))
# Ordinary polynomial regression --
p <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4))
summary(p)
basicplot(x, y, title="Polynomial, insample") # Data
lines(x, fitted(p), col="red", lwd=3) # In-sample
basicplot(x, y, title="Polynomial, out-of-sample")
predictions.p <- predict(p, list(x = x.outsample)) # Out-of-sample
lines(x.outsample, predictions.p, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)
# As expected, polynomial fitting gives terrible results out of sample.
# Now the IDENTICAL steps, using orthogonal polynomials --
d <- lm(y ~ poly(x, 4))
summary(d)
basicplot(x, y, title="Orth. poly., insample") # Data
lines(x, fitted(d), col="red", lwd=3) # In-sample
basicplot(x, y, title="Orth. poly., out-of-sample")
predictions.op <- predict(d, list(x = x.outsample)) # Out-of-sample
lines(x.outsample, predictions.op, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)
# predict(d) is magical! See ?SafePrediction
# The story runs at two levels. First, when you estimate an OLS model,
# predict()ion requires applying the coefficients to an appropriate
# X matrix. But one level deeper, the polynomial (or orthogonal
# polynomial) basis has to be re-evaluated to build that X matrix from
# the supplied x.outsample data.
# If you say p <- poly(x, n)
# then you can say predict(p, new) where predict.poly() gets invoked.
# And when you say predict(lm()), the full steps are worked out for
# you automatically: predict.poly() is used to make an X matrix and
# then prediction based on the regression results is done.
all.equal(predictions.p, predictions.op) # Both paths are identical for this
# (tame) problem.
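# A hedged side-note (not in the original file): poly() can also return
# raw powers with raw=TRUE. The fitted values should agree with the
# orthogonal version up to numerical error, but the columns are again
# strongly correlated --
d.raw <- lm(y ~ poly(x, 4, raw=TRUE))
all.equal(fitted(d.raw), fitted(d))
cor(poly(x, 4, raw=TRUE))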

@ -0,0 +1,26 @@
# Goal: Utilise matrix notation
# We use the problems of portfolio analysis as an example.
# Prices of 4 firms to play with, at weekly frequency (for calendar 2004) --
p <- structure(c(300.403, 294.604, 291.038, 283.805, 270.773, 275.506, 292.271, 292.837, 284.872, 295.037, 280.939, 259.574, 250.608, 268.84, 266.507, 263.94, 273.173, 238.609, 230.677, 192.847, 219.078, 201.846, 210.279, 193.281, 186.748, 197.314, 202.813, 204.08, 226.044, 242.442, 261.274, 269.173, 256.05, 259.75, 243, 250.3, 263.45, 279.5, 289.55, 291.95, 302.1, 284.4, 283.5, 287.8, 298.3, 307.6, 307.65, 311.9, 327.7, 318.1, 333.6, 358.9, 385.1, 53.6, 51.95, 47.65, 44.8, 44.85, 44.3, 47.1, 44.2, 41.8, 41.9, 41, 35.3, 33.35, 35.6, 34.55, 35.55, 40.05, 35, 34.85, 28.95, 31, 29.25, 29.05, 28.95, 24.95, 26.15, 28.35, 29.4, 32.55, 37.2, 39.85, 40.8, 38.2, 40.35, 37.55, 39.4, 39.8, 43.25, 44.75, 47.25, 49.6, 47.6, 46.35, 49.4, 49.5, 50.05, 50.5, 51.85, 56.35, 54.15, 58, 60.7, 62.7, 293.687, 292.746, 283.222, 286.63, 259.774, 259.257, 270.898, 250.625, 242.401, 248.1, 244.942, 239.384, 237.926, 224.886, 243.959, 270.998, 265.557, 257.508, 258.266, 257.574, 251.917, 250.583, 250.783, 246.6, 252.475, 266.625, 263.85, 249.925, 262.9, 264.975, 273.425, 275.575, 267.2, 282.25, 284.25, 290.75, 295.625, 296.25, 291.375, 302.225, 318.95, 324.825, 320.55, 328.75, 344.05, 345.925, 356.5, 368.275, 374.825, 373.525, 378.325, 378.6, 374.4, 1416.7, 1455.15, 1380.97, 1365.31, 1303.2, 1389.64, 1344.05, 1266.29, 1265.61, 1312.17, 1259.25, 1297.3, 1327.38, 1250, 1328.03, 1347.46, 1326.79, 1286.54, 1304.84, 1272.44, 1227.53, 1264.44, 1304.34, 1277.65, 1316.12, 1370.97, 1423.35, 1382.5, 1477.75, 1455.15, 1553.5, 1526.8, 1479.85, 1546.8, 1565.3, 1606.6, 1654.05, 1689.7, 1613.95, 1703.25, 1708.05, 1786.75, 1779.75, 1906.35, 1976.6, 2027.2, 2057.85, 2029.6, 2051.35, 2033.4, 2089.1, 2065.2, 2091.7), .Dim = c(53, 4), .Dimnames = list(NULL, c("TISCO", "SAIL", "Wipro", "Infosys")))
# Shift from prices to returns --
r <- 100*diff(log(p))
# Historical expected returns --
colMeans(r)
# Historical correlation matrix --
cor(r)
# Historical covariance matrix --
S <- cov(r)
S
# Historical portfolio variance for a stated portfolio of 20%,20%,30%,30% --
w <- c(.2, .2, .3, .3)
t(w) %*% S %*% w
# The portfolio optimisation function in tseries --
library(tseries)
optimised <- portfolio.optim(r) # This uses the historical facts from r
optimised$pw # Weights
optimised$pm # Expected return using these weights
optimised$ps # Standard deviation of optimised port.
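# A short follow-up sketch (not in the original file): the risk and
# return implied by the stated 20/20/30/30 weights, for comparison with
# the optimiser's answer --
sqrt(t(w) %*% S %*% w)      # portfolio standard deviation
sum(w * colMeans(r))        # portfolio expected return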

@ -0,0 +1,17 @@
> sum(2,7,5)
[1] 14
> x <- c(2, NA, 3, 1, 4)
> x
[1] 2 NA 3 1 4
> sum(x) # if any element is NA or NaN, result is NA or NaN
[1] NA
> sum(x, na.rm=TRUE) # this way we can ignore NA and NaN values
[1] 10
> mean(x, na.rm=TRUE)
[1] 2.5
> prod(x, na.rm=TRUE)
[1] 24
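# A hedged addendum (not part of the original transcript): another
# common idiom is to drop the missing values explicitly --
> x[!is.na(x)]
[1] 2 3 1 4
> sum(x[!is.na(x)])      # same answer as sum(x, na.rm=TRUE)
[1] 10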
