Adding R programming language

master
Michael Reber 5 years ago
parent 599b63599b
commit 026079a47d

@ -0,0 +1,31 @@
# Goals: A first look at R objects - vectors, lists, matrices, data frames.
# To make vectors "x" "y" "year" and "names"
x <- c(2,3,7,9)
y <- c(9,7,3,2)
year <- 1990:1993
names <- c("payal", "shraddha", "kritika", "itida")
# Accessing the 1st and last elements of y --
y[1]
y[length(y)]
# To make a list "person" --
person <- list(name="payal", x=2, y=9, year=1990)
person
# Accessing things inside a list --
person$name
person$x
# To make a matrix, pasting together the columns "year" "x" and "y"
# The verb cbind() stands for "column bind"
cbind(year, x, y)
# To make a "data frame", which is a list of vectors of the same length --
D <- data.frame(names, year, x, y)
nrow(D)
# Accessing one of these vectors
D$names
# Accessing the last element of this vector
D$names[nrow(D)]
# Or equally,
D$names[length(D$names)]

@ -0,0 +1,11 @@
# Goal: A histogram with tails shown in red.
# This happened on the R mailing list on 7 May 2004.
# This is by Martin Maechler <maechler@stat.math.ethz.ch>, who was
# responding to a slightly imperfect version of this by
# "Guazzetti Stefano" <Stefano.Guazzetti@ausl.re.it>
x <- rnorm(1000)
hx <- hist(x, breaks=100, plot=FALSE)
plot(hx, col=ifelse(abs(hx$breaks) < 1.669, 4, 2))
# What is cool is that "col" is supplied a vector.

@ -0,0 +1,67 @@
# Goals: ARMA modeling - estimation, diagnostics, forecasting.
# 0. SETUP DATA
rawdata <- c(-0.21,-2.28,-2.71,2.26,-1.11,1.71,2.63,-0.45,-0.11,4.79,5.07,-2.24,6.46,3.82,4.29,-1.47,2.69,7.95,4.46,7.28,3.43,-3.19,-3.14,-1.25,-0.50,2.25,2.77,6.72,9.17,3.73,6.72,6.04,10.62,9.89,8.23,5.37,-0.10,1.40,1.60,3.40,3.80,3.60,4.90,9.60,18.20,20.60,15.20,27.00,15.42,13.31,11.22,12.77,12.43,15.83,11.44,12.32,12.10,12.02,14.41,13.54,11.36,12.97,10.00,7.20,8.74,3.92,8.73,2.19,3.85,1.48,2.28,2.98,4.21,3.85,6.52,8.16,5.36,8.58,7.00,10.57,7.12,7.95,7.05,3.84,4.93,4.30,5.44,3.77,4.71,3.18,0.00,5.25,4.27,5.14,3.53,4.54,4.70,7.40,4.80,6.20,7.29,7.30,8.38,3.83,8.07,4.88,8.17,8.25,6.46,5.96,5.88,5.03,4.99,5.87,6.78,7.43,3.61,4.29,2.97,2.35,2.49,1.56,2.65,2.49,2.85,1.89,3.05,2.27,2.91,3.94,2.34,3.14,4.11,4.12,4.53,7.11,6.17,6.25,7.03,4.13,6.15,6.73,6.99,5.86,4.19,6.38,6.68,6.58,5.75,7.51,6.22,8.22,7.45,8.00,8.29,8.05,8.91,6.83,7.33,8.52,8.62,9.80,10.63,7.70,8.91,7.50,5.88,9.82,8.44,10.92,11.67)
# Make an R time-series out of the rawdata: specify frequency & start date
gIIP <- ts(rawdata, frequency=12, start=c(1991,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Full data")
grid()
# Based on this, I decide that 4/1995 is the start of the sensible period.
gIIP <- window(gIIP, start=c(1995,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Estimation subset")
grid()
# Descriptive statistics about gIIP
mean(gIIP); sd(gIIP); summary(gIIP);
plot(density(gIIP), col="blue", main="(Unconditional) Density of IIP growth")
acf(gIIP)
# 1. ARMA ESTIMATION
m.ar2 <- arima(gIIP, order = c(2,0,0))
print(m.ar2) # Print it out
# 2. ARMA DIAGNOSTICS
tsdiag(m.ar2) # His pretty picture of diagnostics
## Time series structure in errors
print(Box.test(m.ar2$residuals, lag=12, type="Ljung-Box"));
## Sniff for ARCH
print(Box.test(m.ar2$residuals^2, lag=12, type="Ljung-Box"));
## Eyeball distribution of residuals
plot(density(m.ar2$residuals), col="blue", xlim=c(-8,8),
main=paste("Residuals of AR(2)"))
# 3. FORECASTING
## Make a picture of the residuals
plot.ts(m.ar2$residual, ylab="Innovations", col="blue", lwd=2)
s <- sqrt(m.ar2$sigma2)
abline(h=c(-s,s), lwd=2, col="lightGray")
p <- predict(m.ar2, n.ahead = 12) # Make 12 predictions.
print(p)
## Watch the forecastability decay away from large values to 0.
## sd(gIIP) is the naive sigma. p$se is the prediction se.
gain <- 100*(1-p$se/sd(gIIP))
plot.ts(gain, main="Gain in forecast s.d.", ylab="Per cent",
col="blue", lwd=2)
## Make a pretty picture that puts it all together
ts.plot(gIIP, p$pred, p$pred-1.96*p$se, p$pred+1.96*p$se,
gpars=list(lty=c(1,1,2,2), lwd=c(2,2,1,1),
ylab="IIP growth (%)", col=c("blue","red", "red", "red")))
grid()
abline(h=mean(gIIP), lty=2, lwd=2, col="lightGray")
legend(x="bottomleft", cex=0.8, bty="n",
lty=c(1,1,2,2), lwd=c(2,1,1,2),
col=c("blue", "red", "red", "lightGray"),
legend=c("IIP", "AR(2) forecasts", "95% C.I.", "Mean IIP growth"))

@ -0,0 +1,16 @@
> x
[1] 3 6 8
> y
[1] 2 9 0
> x + y
[1] 5 15 8
> x + 1 # 1 is recycled to (1,1,1)
[1] 4 7 9
> x + c(1,4) # (1,4) is recycled to (1,4,1) but warning issued
[1] 4 10 9
Warning message:
In x + c(1, 4) :
longer object length is not a multiple of shorter object length

@ -0,0 +1,33 @@
# Goal: All manner of import and export of datasets.
# Invent a dataset --
A <- data.frame(
name=c("a","b","c"),
ownership=c("Case 1","Case 1","Case 2"),
listed.at=c("NSE",NA,"BSE"),
# Firm "b" is unlisted.
is.listed=c(TRUE,FALSE,TRUE),
# R convention - boolean variables are named "is.something"
x=c(2.2,3.3,4.4),
date=as.Date(c("2004-04-04","2005-05-05","2006-06-06"))
)
# To a spreadsheet through a CSV file --
write.table(A,file="demo.csv",sep = ",",col.names = NA,qmethod = "double")
B <- read.table("demo.csv", header = TRUE, sep = ",", row.names = 1)
# To R as a binary file --
save(A, file="demo.rda")
load("demo.rda")
# To the open XML standard for the transport of statistical data --
library(StatDataML)
writeSDML(A, "/tmp/demo.sdml")
B <- readSDML("/tmp/demo.sdml")
# To Stata --
library(foreign)
write.dta(A, "/tmp/demo.dta")
B <- read.dta("/tmp/demo.dta")
# foreign::write.foreign() also has a pathway to SAS and SPSS.
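# A sketch of that pathway (my addition; the /tmp file names are illustrative):
# write.foreign() emits a text data file plus a syntax file that the target
# package runs to read the data back in.
write.foreign(A, datafile="/tmp/demo_spss.txt", codefile="/tmp/demo_spss.sps", package="SPSS")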

@ -0,0 +1,33 @@
# Goal: An example of simulation-based inference.
# This is in the context of testing for time-series dependence in
# stock market returns data.
# The code here implements the idea of Kim, Nelson and Startz (1991).
# We want to use the distribution of real-world returns data, without
# needing assumptions about normality.
# The null is lack of dependence (i.e. an efficient market).
# So repeatedly, the data is permuted, and the sample ACF is computed.
# This gives us the distribution of the ACF under H0: independence, but
# while using the empirical distribution of the returns data.
# Weekly returns on Nifty, 1/1/2002 to 31/12/2003, 104 weeks of data.
r <- c(-0.70031182197603, 0.421690133064168, -1.20098072984689, 0.143402360644984, 3.81836537549516, 3.17055939373247, 0.305580301919228, 1.23853814691852, 0.81584795095706, -1.51865139747764, -2.71223626421522, -0.784836480094242, 1.09180041170998, 0.397649587762761, -4.11309534220923, -0.263912425099111, -0.0410144239805454, 1.75756212770972, -2.3335373897992, -2.19228764624217, -3.64578978183987, 1.92535789661354, 3.45782867883164, -2.15532607229374, -0.448039988298987, 1.50124793565896, -1.45871585874362, -2.13459863369767, -6.2128068251802, -1.94482987066289, 0.751294815735637, 1.78244982829590, 1.61567494389745, 1.53557708728931, -1.53557708728931, -0.322061470004265, -2.28394919698225, 0.70399304137414, -2.93580952607737, 2.38125098034425, 0.0617697039252185, -4.14482733720716, 2.04397528093754, 0.576400673606603, 3.43072725191913, 2.96465382864843, 2.89833358015583, 1.85387040058336, 1.52136515035952, -0.637268376944444, 1.75418926224609, -0.804391905851354, -0.861816058320475, 0.576902488444109, -2.84259880663331, -1.35375536139417, 1.49096529042234, -2.05404881010045, 2.86868849528146, -0.258270670200478, -4.4515881438687, -1.73055019137092, 3.04427015714648, -2.94928202352018, 1.62081315773994, -6.83117945164824, -0.962715713711582, -1.75875847071740, 1.50330330252721, -0.0479705789653728, 3.68968303215933, -0.535807567290103, 3.94034871061182, 3.85787174417738, 0.932185956989873, 4.08598654183674, 2.27343783689715, 1.13958830440017, 2.01737201171230, -1.88131458327554, 1.97596267156648, 2.79857144562001, 2.22470306481695, 2.03212951411427, 4.95626853448883, 3.40400972901396, 3.03840139165246, -1.89863129741417, -3.70832135042951, 4.78478922155396, 4.3973589590097, 4.9667050392987, 2.99775078737081, -4.12349101552438, 3.25638269809945, 2.29683376253966, -2.64772825878214, -0.630835277076258, 4.72528848505451, 1.87368447333380, 3.17543946162564, 4.58174427843208, 3.23625985632168, 2.29777651227296)
# The 1st autocorrelation from the sample:
acf(r, 1, plot=FALSE)$acf[2]
# Obtain 1000 draws from the distribution of the 1st autocorrelation
# under the null of independence:
set.seed(101)
simulated <- replicate(1000, acf(r[sample(1:104, replace=FALSE)], 1, plot=FALSE)$acf[2])
# At 95% --
quantile(simulated, probs=c(.025,.975))
# At 99% --
quantile(simulated, probs=c(.005,.995))
# So we can reject the null at 95% but not at 99%.
# A pretty picture.
plot(density(simulated), col="blue")
abline(v=0)
abline(v=quantile(simulated, probs=c(.025,.975)), lwd=2, col="purple")
abline(v=acf(r, 1, plot=FALSE)$acf[2], lty=2, lwd=4, col="yellow")

@ -0,0 +1,52 @@
# Goal: Associative arrays (as in awk) or hashes (as in perl).
# Or, more generally, adventures in R addressing.
# Here's a plain R vector:
x <- c(2,3,7,9)
# But now I tag every elem with labels:
names(x) <- c("kal","sho","sad","aja")
# Associative array operations:
x["kal"] <- 12
# Pretty printing the entire associative array:
x
# This works for matrices too:
m <- matrix(runif(10), nrow=5)
rownames(m) <- c("violet","indigo","blue","green","yellow")
colnames(m) <- c("Asia","Africa")
# The full matrix --
m
# Or even better --
library(xtable)
xtable(m)
# Now address symbolically --
m[,"Africa"]
m["indigo",]
m["indigo","Africa"]
# The "in" operator, as in awk --
for (colour in c("yellow", "orange", "red")) {
if (colour %in% rownames(m)) {
cat("For Africa and ", colour, " we have ", m[colour, "Africa"], "\n")
} else {
cat("Colour ", colour, " does not exist in the hash.\n")
}
}
# This works for data frames also --
D <- data.frame(m)
D
# Look closely at what happened --
str(D) # The colours are the rownames(D).
# Operations --
D$Africa
D[,"Africa"]
D["yellow",]
# or
subset(D, rownames(D)=="yellow")
colnames(D) <- c("Antarctica","America")
D
D$America

@ -0,0 +1,16 @@
Decimal to Binary
# Program to convert decimal
# number into binary number
# using recursive function
convert_to_binary <- function(n) {
if(n > 1) {
convert_to_binary(as.integer(n/2))
}
cat(n %% 2)
}
Output
> convert_to_binary(52)
110100

@ -0,0 +1,25 @@
Check Armstrong number
# take input from the user
num = as.integer(readline(prompt="Enter a number: "))
# initialize sum
sum = 0
# find the sum of the cube of each digit
temp = num
while(temp > 0) {
digit = temp %% 10
sum = sum + (digit ^ 3)
temp = floor(temp / 10)
}
# display the result
if(num == sum) {
print(paste(num, "is an Armstrong number"))
} else {
print(paste(num, "is not an Armstrong number"))
}
Output 1
Enter a number: 23
[1] "23 is not an Armstrong number"

@ -0,0 +1,27 @@
Check Leap Year
# Program to check if
# the input year is
# a leap year or not
year = as.integer(readline(prompt="Enter a year: "))
if((year %% 4) == 0) {
if((year %% 100) == 0) {
if((year %% 400) == 0) {
print(paste(year,"is a leap year"))
} else {
print(paste(year,"is not a leap year"))
}
} else {
print(paste(year,"is a leap year"))
}
} else {
print(paste(year,"is not a leap year"))
}
Output 1
Enter a year: 1900
[1] "1900 is not a leap year"
Output 2
Enter a year: 2000
[1] "2000 is a leap year"

@ -0,0 +1,21 @@
Check Odd and Even Number
# Program to check if
# the input number is odd or even.
# A number is even if division
# by 2 gives a remainder of 0.
# If remainder is 1, it is odd.
num = as.integer(readline(prompt="Enter a number: "))
if((num %% 2) == 0) {
print(paste(num,"is Even"))
} else {
print(paste(num,"is Odd"))
}
Output 1
Enter a number: 89
[1] "89 is Odd"
Output 2
Enter a number: 0
[1] "0 is Even"

@ -0,0 +1,24 @@
Check Positive, Negative or Zero
# In this program, we input a number
# check if the number is positive or
# negative or zero and display
# an appropriate message
num = as.double(readline(prompt="Enter a number: "))
if(num > 0) {
print("Positive number")
} else {
if(num == 0) {
print("Zero")
} else {
print("Negative number")
}
}
Output 1
Enter a number: -9.6
[1] "Negative number"
Output 2
Enter a number: 2
[1] "Positive number"

@ -0,0 +1,27 @@
Check Prime Number
# Program to check if
# the input number is
# prime or not
# take input from the user
num = as.integer(readline(prompt="Enter a number: "))
flag = 0
# prime numbers are greater than 1
if(num > 1) {
# check for factors
flag = 1
for(i in 2:(num-1)) {
if ((num %% i) == 0) {
flag = 0
break
}
}
}
if(num == 2) flag = 1
if(flag == 1) {
print(paste(num,"is a prime number"))
} else {
print(paste(num,"is not a prime number"))
}

@ -0,0 +1,30 @@
Compute LCM in R
# Program to find the L.C.M. of two input numbers
lcm <- function(x, y) {
# choose the greater number
if(x > y) {
greater = x
} else {
greater = y
}
while(TRUE) {
if((greater %% x == 0) && (greater %% y == 0)) {
lcm = greater
break
}
greater = greater + 1
}
return(lcm)
}
# take input from the user
num1 = as.integer(readline(prompt = "Enter first number: "))
num2 = as.integer(readline(prompt = "Enter second number: "))
print(paste("The L.C.M. of", num1,"and", num2,"is", lcm(num1, num2)))
Output
Enter first number: 24
Enter second number: 25
[1] "The L.C.M. of 24 and 25 is 600"

@ -0,0 +1,15 @@
# Goals: Do bootstrap inference, as an example, for a sample median.
library(boot)
samplemedian <- function(x, d) { # d is a vector of integer indexes
return(median(x[d])) # The genius is in the x[d] notation
}
data <- rnorm(50) # Generate a dataset with 50 obs
b <- boot(data, samplemedian, R=2000) # 2000 bootstrap replications
cat("Sample median has a sigma of ", sd(b$t[,1]), "\n")
plot(b)
# Make a 99% confidence interval
boot.ci(b, conf=0.99, type="basic")

@ -0,0 +1,44 @@
# Goal: Simulate a dataset from the OLS model and
# obtain OLS estimates for it.
x <- runif(100, 0, 10) # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma = 1
# You want to just look at OLS results?
summary(lm(y ~ x))
# Suppose x and y were packed together in a data frame --
D <- data.frame(x,y)
summary(lm(y ~ x, D))
# Full and elaborate steps --
d <- lm(y ~ x)
# Learn about this object by saying ?lm and str(d)
# Compact model results --
print(d)
# Pretty graphics for regression diagnostics --
par(mfrow=c(2,2))
plot(d)
d <- summary(d)
# Detailed model results --
print(d)
# Learn about this object by saying ?summary.lm and by saying str(d)
cat("OLS gave slope of ", d$coefficients[2,1],
"and a error sigma of ", d$sigma, "\n")
## I need to drop down to a smaller dataset now --
x <- runif(10)
y <- 2 + 3*x + rnorm(10)
m <- lm(y ~ x)
# Now R supplies a wide range of generic functions which extract
# useful things out of the result of estimation of many kinds of models.
residuals(m)
fitted(m)
AIC(m)
AIC(m, k=log(10)) # SBC
vcov(m)
logLik(m)

@ -0,0 +1,55 @@
# Goal: "Dummy variables" in regression.
# Suppose you have this data:
people = data.frame(
age = c(21,62,54,49,52,38),
education = c("college", "school", "none", "school", "college", "none"),
education.code = c( 2, 1, 0, 1, 2, 0 )
)
# Here people$education is a string categorical variable and
# people$education.code is the same thing, with a numerical coding system.
people
# Note the structure of the dataset --
str(people)
# The strings supplied for `education' have been treated (correctly) as
# a factor, but education.code is being treated as an integer and not as
# a factor.
# We want to do a dummy variable regression. Normally you would have:
# 1 Chosen college as the omitted category
# 2 Made a dummy for "none" named educationnone
# 3 Made a dummy for "school" named educationschool
# 4 Ran a regression like lm(age ~ educationnone + educationschool, people)
# But this is R. Things are cool:
lm(age ~ education, people)
# ! :-)
# When you feed him an explanatory variable like education, he does all
# these steps automatically. (He chose college as the omitted category).
# If you use an integer coding, then the obvious thing goes wrong --
lm(age ~ education.code, people)
# because he's thinking that education.code is an integer explanatory
# variable. So you need to:
lm(age ~ factor(education.code), people)
# (he chose a different omitted category)
# Alternatively, fix up the dataset --
people$education.code <- factor(people$education.code)
lm(age ~ education.code, people)
#
# Bottom line:
# Once the dataset has categorical variables correctly represented as factors, i.e. as shown by
str(people)
# doing OLS in R induces automatic generation of dummy variables while leaving one out:
lm(age ~ education, people)
lm(age ~ education.code, people)
# But what if you want the X matrix?
m <- lm(age ~ education, people)
model.matrix(m)
# This is the design matrix that went into the regression m.

@ -0,0 +1,38 @@
Fibonacci Sequence in R
# Program to display the Fibonacci
# sequence up to n-th term using
# recursive functions
recurse_fibonacci <- function(n) {
if(n <= 1) {
return(n)
} else {
return(recurse_fibonacci(n-1) + recurse_fibonacci(n-2))
}
}
# take input from the user
nterms = as.integer(readline(prompt="How many terms? "))
# check if the number of terms is valid
if(nterms <= 0) {
print("Plese enter a positive integer")
} else {
print("Fibonacci sequence:")
for(i in 0:(nterms-1)) {
print(recurse_fibonacci(i))
}
}
Output
How many terms? 9
[1] "Fibonacci sequence:"
[1] 0
[1] 1
[1] 1
[1] 2
[1] 3
[1] 5
[1] 8
[1] 13
[1] 21

@ -0,0 +1,14 @@
Find Factorial of a number using recursion
recur_factorial <- function(n) {
if(n <= 1) {
return(1)
} else {
return(n * recur_factorial(n-1))
}
}
Output
> recur_factorial(5)
[1] 120

@ -0,0 +1,33 @@
Find Minimum and Maximum
> x
[1] 5 8 3 9 2 7 4 6 10
> # find the minimum
> min(x)
[1] 2
> # find the maximum
> max(x)
[1] 10
> # find the range
> range(x)
[1] 2 10
If we want to find where the minimum or maximum is located, i.e. the index instead of the actual value, we can use the which.min() and which.max() functions.
Note that these functions return the index of the first minimum or maximum in case multiple of them exist.
> x
[1] 5 8 3 9 2 7 4 6 10
> # find index of the minimum
> which.min(x)
[1] 5
> # find index of the maximum
> which.max(x)
[1] 9
> # alternate way to find the minimum
> x[which.min(x)]
[1] 2

@ -0,0 +1,29 @@
Find factors of a number
print_factors <- function(x) {
print(paste("The factors of",x,"are:"))
for(i in 1:x) {
if((x %% i) == 0) {
print(i)
}
}
}
Output
> print_factors(120)
[1] "The factors of 120 are:"
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 8
[1] 10
[1] 12
[1] 15
[1] 20
[1] 24
[1] 30
[1] 40
[1] 60
[1] 120

@ -0,0 +1,21 @@
Find sum of natural numbers without formula
# take input from the user
num = as.integer(readline(prompt = "Enter a number: "))
if(num < 0) {
print("Enter a positive number")
} else {
sum = 0
# use while loop to iterate until zero
while(num > 0) {
sum = sum + num
num = num - 1
}
print(paste("The sum is", sum))
}
Output
Enter a number: 10
[1] "The sum is 55"

@ -0,0 +1,20 @@
#Find the factorial of a number
# take input from the user
num = as.integer(readline(prompt="Enter a number: "))
factorial = 1
# check if the number is negative, positive or zero
if(num < 0) {
print("Sorry, factorial does not exist for negative numbers")
} else if(num == 0) {
print("The factorial of 0 is 1")
} else {
for(i in 1:num) {
factorial = factorial * i
}
print(paste("The factorial of", num ,"is",factorial))
}
Output
Enter a number: 8
[1] "The factorial of 8 is 40320"

@ -0,0 +1,8 @@
> runif(1) # generates 1 random number
[1] 0.3984754
> runif(3) # generates 3 random numbers
[1] 0.8090284 0.1797232 0.6803607
> runif(3, min=5, max=10) # define the range between 5 and 10
[1] 7.099781 8.355461 5.173133

@ -0,0 +1,22 @@
# Goal:
# A stock is traded on 2 exchanges.
# Price data is missing at random on both exchanges owing to non-trading.
# We want to make a single price time-series utilising information
# from both exchanges. I.e., missing data for exchange 1 will
# be replaced by information for exchange 2 (if observed).
# Let's create some example data for the problem.
e1 <- runif(15) # Prices on exchange 1
e2 <- e1 + 0.05*rnorm(15) # Prices on exchange 2.
cbind(e1, e2)
# Blow away 5 points from each at random.
e1[sample(1:15, 5)] <- NA
e2[sample(1:15, 5)] <- NA
cbind(e1, e2)
# Now how do we reconstruct a time-series that tries to utilise both?
combined <- e1 # Do use the more liquid exchange here.
missing <- is.na(combined)
combined[missing] <- e2[missing] # if it's also missing, I don't care.
cbind(e1, e2, combined)
# There you are.

@ -0,0 +1,43 @@
# Goal: Joint distributions, marginal distributions, useful tables.
# First let me invent some fake data
set.seed(102) # This yields a good illustration.
x <- sample(1:3, 15, replace=TRUE)
education <- factor(x, labels=c("None", "School", "College"))
x <- sample(1:2, 15, replace=TRUE)
gender <- factor(x, labels=c("Male", "Female"))
age <- runif(15, min=20,max=60)
D <- data.frame(age, gender, education)
rm(x,age,gender,education)
print(D)
# Table about education
table(D$education)
# Table about education and gender --
table(D$gender, D$education)
# Joint distribution of education and gender --
table(D$gender, D$education)/nrow(D)
# Add in the marginal distributions also
addmargins(table(D$gender, D$education))
addmargins(table(D$gender, D$education))/nrow(D)
# Generate a good LaTeX table out of it --
library(xtable)
xtable(addmargins(table(D$gender, D$education))/nrow(D),
digits=c(0,2,2,2,2)) # You have to do | and \hline manually.
# Study age by gender
by(D$age, D$gender, mean)
by(D$age, D$gender, sd)
by(D$age, D$gender, summary)
# Two-way table showing average age depending on education & gender
a <- matrix(by(D$age, list(D$gender, D$education), mean), nrow=2)
rownames(a) <- levels(D$gender)
colnames(a) <- levels(D$education)
print(a)
# or, of course,
print(xtable(a))

@ -0,0 +1,23 @@
# Goals: Simulate a dataset from a "fixed effects" model, and
# obtain "least squares dummy variable" (LSDV) estimates.
#
# We do this in the context of a familiar "earnings function" -
# log earnings is quadratic in experience, with parallel shifts by
# education category.
# Create an education factor with 4 levels --
education <- factor(sample(1:4,1000, replace=TRUE),
labels=c("none", "school", "college", "beyond"))
# Simulate an experience variable with a plausible range --
experience <- 30*runif(1000) # experience from 0 to 30 years
# Make the intercept vary by education category between 4 given values --
intercept <- c(0.5,1,1.5,2)[education]
# Simulate the log earnings --
log.earnings <- intercept +
2*experience - 0.05*experience*experience + rnorm(1000)
A <- data.frame(education, experience, e2=experience*experience, log.earnings)
summary(A)
# The OLS path to LSDV --
summary(lm(log.earnings ~ -1 + education + experience + e2, A))

@ -0,0 +1,31 @@
# Goal: Make a time-series object using the "zoo" package
A <- data.frame(date=c("1995-01-01", "1995-01-02", "1995-01-03", "1995-01-06"),
x=runif(4),
y=runif(4))
A$date <- as.Date(A$date) # yyyy-mm-dd is the default format
# So far there's nothing new - it's just a data frame. I have hand-
# constructed A but you could equally have obtained it using read.table().
# I want to make a zoo matrix out of the numerical columns of A
library(zoo)
B <- A
B$date <- NULL
z <- zoo(as.matrix(B), order.by=A$date)
rm(A, B)
# So now you are holding "z", a "zoo" object. You can do many cool
# things with it.
# See http://www.google.com/search?hl=en&q=zoo+quickref+achim&btnI=I%27m+Feeling+Lucky
# To drop down to a plain data matrix, say
C <- coredata(z)
rownames(C) <- as.character(time(z))
# Compare --
str(C)
str(z)
# The above is a tedious way of doing these things, designed to give you
# an insight into what is going on. If you just want to read a file
# into a zoo object, a very short path is something like:
# z <- read.zoo(filename, format="%d %b %Y")
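# A runnable sketch of that short path (my addition; the /tmp file name and
# its layout are invented for illustration) --
write.csv(data.frame(date=as.character(time(z)), coredata(z)),
"/tmp/zdemo.csv", row.names=FALSE)
z2 <- read.zoo("/tmp/zdemo.csv", header=TRUE, sep=",", format="%Y-%m-%d")
str(z2)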

@ -0,0 +1,20 @@
# Goal: Make pictures in PDF files that can be put into a paper.
xpts <- seq(-3,3,.05)
# Here is my suggested setup for a two-column picture --
pdf("demo2.pdf", width=5.6, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off() # close the device so demo2.pdf is written out
# My suggested setup for a square one-column picture --
pdf("demo1.pdf", width=2.8, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off() # close the device so demo1.pdf is written out

@ -0,0 +1,25 @@
Multiplication Table
# Program to find the multiplication
# table (from 1 to 10)
# of a number input by the user
# take input from the user
num = as.integer(readline(prompt = "Enter a number: "))
# use for loop to iterate 10 times
for(i in 1:10) {
print(paste(num,'x', i, '=', num*i))
}
Output
Enter a number: 7
[1] "7 x 1 = 7"
[1] "7 x 2 = 14"
[1] "7 x 3 = 21"
[1] "7 x 4 = 28"
[1] "7 x 5 = 35"
[1] "7 x 6 = 42"
[1] "7 x 7 = 49"
[1] "7 x 8 = 56"
[1] "7 x 9 = 63"
[1] "7 x 10 = 70"

@ -0,0 +1,13 @@
# Goal: Display two series on one plot, one with a left y axis
# and another with a right y axis.
y1 <- cumsum(rnorm(100))
y2 <- cumsum(rnorm(100, mean=0.2))
par(mai=c(.8, .8, .2, .8))
plot(1:100, y1, type="l", col="blue", xlab="X axis label", ylab="Left legend")
par(new=TRUE)
plot(1:100, y2, type="l", ann=FALSE, yaxt="n")
axis(4)
legend(x="topleft", bty="n", lty=c(1,1), col=c("blue","black"),
legend=c("String 1 (left scale)", "String 2 (right scale)"))

@ -0,0 +1,99 @@
# Goal: Prices and returns
# I like to multiply returns by 100 so as to have "units in percent".
# In other words, I like it for 5% to be a value like 5 rather than 0.05.
###################################################################
# I. Simulate random-walk prices, switch between prices & returns.
###################################################################
# Simulate a time-series of PRICES drawn from a random walk
# where one-period returns are i.i.d. N(mu, sigma^2).
ranrw <- function(mu, sigma, p0=100, T=100) {
cumprod(c(p0, 1 + (rnorm(n=T, mean=mu, sd=sigma)/100)))
}
prices2returns <- function(x) {
100*diff(log(x))
}
returns2prices <- function(r, p0=100) {
c(p0, p0 * exp(cumsum(r/100)))
}
cat("Simulate 25 points from a random walk starting at 1500 --\n")
p <- ranrw(0.05, 1.4, p0=1500, T=25)
# gives you a 26-element series (the starting price of 1500 plus 25 steps), where
# one-period returns are N(0.05,1.4^2) percent.
print(p)
cat("Convert to returns--\n")
r <- prices2returns(p)
print(r)
cat("Go back from returns to prices --\n")
goback <- returns2prices(r, 1500)
print(goback)
###################################################################
# II. Plenty of powerful things you can do with returns....
###################################################################
summary(r); sd(r) # summary statistics
plot(density(r)) # kernel density plot
acf(r) # Autocorrelation function
ar(r) # Estimate a AIC-minimising AR model
Box.test(r, lag=2, type="Ljung") # Box-Ljung test
library(tseries)
runs.test(factor(sign(r))) # Runs test
bds.test(r) # BDS test.
###################################################################
# III. Visualisation and the random walk
###################################################################
# I want to obtain intuition into what kinds of price series can happen,
# given a starting price, a mean return, and a given standard deviation.
# This function simulates out 10000 days of a price time-series at a time,
# and waits for you to click in the graph window, after which a second
# series is painted, and so on. Make the graph window very big and
# sit back and admire.
# The point is to eyeball many series and thus obtain some intuition
# into what the random walk does.
visualisation <- function(p0, s, mu, labelstring) {
N <- 10000
x <- (1:(N+1))/250 # Unit of years
while (1) {
plot(x, ranrw(mu, s, p0, N), ylab="Level", log="y",
type="l", col="red", xlab="Time (years)",
main=paste("40 years of a process much like", labelstring))
grid()
z=locator(1)
}
}
# Nifty -- assuming sigma of 1.4% a day and E(returns) of 13% a year
visualisation(2600, 1.4, 13/250, "Nifty")
# The numerical values here are used to think about what the INR/USD
# exchange rate would have looked like if it started from 31.37, had
# a mean depreciation of 5% per year, and had the daily vol of a floating
# exchange rate like EUR/USD.
visualisation(31.37, 0.7, 5/365, "INR/USD (NOT!) with daily sigma=0.7")
# This is of course not like the INR/USD series in the real world -
# which is neither a random walk nor does it have a vol of 0.7% a day.
# The numerical values here are used to think about what the USD/EUR
# exchange rate, starting with 1, having no drift, and having the observed
# daily vol of 0.7. (This is about right).
visualisation(1, 0.7, 0, "USD/EUR with no drift")
###################################################################
# IV. A monte carlo experiment about the runs test
###################################################################
# Measure the effectiveness of the runs test when faced with an
# AR(1) process of length 100 with a coeff of 0.1
set.seed(101)
one.ts <- function() {arima.sim(list(order = c(1,0,0), ar = 0.1), n=100)}
table(replicate(1000, runs.test(factor(sign(one.ts())))$p.value < 0.05))
# We find that the runs test throws up a prob value of below 0.05
# for 91 out of 1000 experiments.
# Wow! :-)
# To understand this, you need to look up the man pages of:
# set.seed, arima.sim, sign, factor, runs.test, replicate, table.
# e.g. say ?replicate

@ -0,0 +1,41 @@
Print Fibonacci Sequence
# take input from the user
nterms = as.integer(readline(prompt="How many terms? "))
# first two terms
n1 = 0
n2 = 1
count = 2
# check if the number of terms is valid
if(nterms <= 0) {
print("Plese enter a positive integer")
} else {
if(nterms == 1) {
print("Fibonacci sequence:")
print(n1)
} else {
print("Fibonacci sequence:")
print(n1)
print(n2)
while(count < nterms) {
nth = n1 + n2
print(nth)
# update values
n1 = n2
n2 = nth
count = count + 1
}
}
}
Output
How many terms? 7
[1] "Fibonacci sequence:"
[1] 0
[1] 1
[1] 1
[1] 2
[1] 3
[1] 5
[1] 8

@ -0,0 +1,30 @@
Program to Find GCD
# Program to find the
# H.C.F. of two input numbers
# define a function
hcf <- function(x, y) {
# choose the smaller number
if(x > y) {
smaller = y
} else {
smaller = x
}
for(i in 1:smaller) {
if((x %% i == 0) && (y %% i == 0)) {
hcf = i
}
}
return(hcf)
}
# take input from the user
num1 = as.integer(readline(prompt = "Enter first number: "))
num2 = as.integer(readline(prompt = "Enter second number: "))
print(paste("The H.C.F. of", num1,"and", num2,"is", hcf(num1, num2)))
Output
Enter first number: 72
Enter second number: 120
[1] "The H.C.F. of 72 and 120 is 24"

@ -0,0 +1,52 @@
# Get the data in place --
load(file="demo.rda")
summary(firms)
# Look at it --
plot(density(log(firms$mktcap)))
plot(firms$mktcap, firms$spread, type="p", cex=.2, col="blue", log="xy",
xlab="Market cap (Mln USD)", ylab="Bid/offer spread (bps)")
m=lm(log(spread) ~ log(mktcap), firms)
summary(m)
# Making deciles --
library(gtools)
library(gdata)
# for deciles (default=quartiles)
size.category = quantcut(firms$mktcap, q=seq(0, 1, 0.1), labels=F)
table(size.category)
means = aggregate(firms, list(size.category), mean)
print(data.frame(means$mktcap,means$spread))
# Make a picture combining the sample mean of spread (in each decile)
# with the weighted average sample mean of the spread (in each decile),
# where weights are proportional to size.
wtd.means = by(firms, size.category,
function(piece) (sum(piece$mktcap*piece$spread)/sum(piece$mktcap)))
lines(means$mktcap, means$spread, type="b", lwd=2, col="green", pch=19)
lines(means$mktcap, wtd.means, type="b", lwd=2, col="red", pch=19)
legend(x=0.25, y=0.5, bty="n",
col=c("blue", "green", "red"),
lty=c(0, 1, 1), lwd=c(0,2,2),
pch=c(0,19,19),
legend=c("firm", "Mean spread in size deciles",
"Size weighted mean spread in size deciles"))
# Within group standard deviations --
aggregate(firms, list(size.category), sd)
# Now I do quartiles by BOTH mktcap and spread.
size.quartiles = quantcut(firms$mktcap, labels=F)
spread.quartiles = quantcut(firms$spread, labels=F)
table(size.quartiles, spread.quartiles)
# Re-express everything as joint probabilities
table(size.quartiles, spread.quartiles)/nrow(firms)
# Compute cell means at every point in the joint table:
aggregate(firms, list(size.quartiles, spread.quartiles), mean)
# Make pretty two-way tables
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, nobs)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, sd)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, sd)

@ -0,0 +1,11 @@
> # We can use the print() function
> print("Hello World!")
[1] "Hello World!"
> # Quotes can be suppressed in the output
> print("Hello World!", quote = FALSE)
[1] Hello World!
> # If there is more than one item, we can concatenate using paste()
> print(paste("How","are","you?"))
[1] "How are you?"

@ -0,0 +1,12 @@
# Goal: R syntax where model specification is an argument to a function.
# Invent a dataset
x <- runif(100); y <- runif(100); z <- 2 + 3*x + 4*y + rnorm(100)
D <- data.frame(x=x, y=y, z=z)
amodel <- function(modelstring) {
summary(lm(modelstring, D))
}
amodel(z ~ x)
amodel(z ~ y)

@ -0,0 +1,3 @@
# R programming language
R is a programming language designed for statistical computing and graphics. This repository contains code that can be executed within the R software environment.

@ -0,0 +1,43 @@
# Goal: Reading and writing ascii files, reading and writing binary files.
# And, to measure how much faster it is to work with binary files.
# First manufacture a tall data frame:
# FYI -- runif(10) yields 10 U(0,1) random numbers.
B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
summary(B)
# Write out ascii file:
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
# Read in this resulting ascii file:
C=read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names=1)
# Write a binary file out of dataset C:
save(C, file="/tmp/foo.binary")
# Delete the dataset C:
rm(C)
# Restore from foo.binary:
load("/tmp/foo.binary")
summary(C) # should yield the same results
# as summary(B) above.
# Now we time all these operations --
cat("Time creation of dataset:\n")
system.time({
B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
})
cat("Time writing an ascii file out of dataset B:\n")
system.time(
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
)
cat("Time reading an ascii file into dataset C:\n")
system.time(
{C=read.table("/tmp/foo.csv", header = TRUE, sep=",", row.names=1)
})
cat("Time writing a binary file out of dataset C:\n")
system.time(save(C, file="/tmp/foo.binary"))
cat("Time reading a binary file + variablenames from /tmp/foo.binary:\n")
system.time(load("/tmp/foo.binary")) # and then read it in from binary file

@ -0,0 +1,15 @@
# Goal: Reading in a Microsoft .xls file directly
library(gdata)
a <- read.xls("file.xls", sheet=2) # This reads in the 2nd sheet
# Look at what the cat dragged in
str(a)
# If you have a date column, you'll want to fix it up like this:
a$date <- as.Date(as.character(a$X), format="%d-%b-%y")
a$X <- NULL
# Also see http://tolstoy.newcastle.edu.au/R/help/06/04/25674.html for
# another path.

@ -0,0 +1,17 @@
# Goal: To read in files produced by CMIE's "Business Beacon".
# This assumes you have made a file of MONTHLY data using CMIE's
# Business Beacon program. This contains 2 columns: M3 and M0.
A <- read.table(
# Generic to all BB files --
sep="|", # CMIE's .txt file is pipe delimited
skip=3, # Skip the 1st 3 lines
na.strings=c("N.A.","Err"), # The ways they encode missing data
# Specific to your immediate situation --
file="bb_data.text",
col.names=c("junk", "date", "M3", "M0")
)
A$junk <- NULL # Blow away this column
# Parse the CMIE-style "Mmm yy" date string that's used on monthly data
A$date <- as.Date(paste("1", as.character(A$date)), format="%d %b %Y")

@ -0,0 +1,16 @@
> # sample with replacement
> sample(x, replace = TRUE)
[1] 15 17 13 9 5 15 11 15 1
> # if we simply pass in a positive number n, it will sample
> # from 1:n without replacement
> sample(10)
[1] 2 4 7 9 1 3 10 5 8 6
An example that simulates tossing a coin 10 times.
> sample(c("H","T"),10, replace = TRUE)
[1] "H" "H" "H" "T" "H" "T" "H" "H" "H" "T"

@ -0,0 +1,40 @@
# Goals: Scare the hell out of children with the Cauchy distribution.
# A function which simulates N draws from one of two distributions,
# and returns the mean obtained thusly.
one.simulation <- function(N=100, distribution="normal") {
if (distribution == "normal") {
x <- rnorm(N)
} else {
x <- rcauchy(N)
}
mean(x)
}
k1 <- density(replicate(1000, one.simulation(20)))
k2 <- density(replicate(1000, one.simulation(20, distribution="cauchy")))
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=0) # both distributions are centred at 0
legend(x="topleft", bty="n",
lty=c(1,1),
col=c("black", "red"),
legend=c("Mean of Normal", "Mean of Cauchy"))
# The distribution of the mean of normals collapses into a point;
# that of the cauchy does not.
# Here's more scary stuff --
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of normal - ",
sd(replicate(1000, one.simulation(20))), "\n")
}
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of cauchy - ",
sd(replicate(1000, one.simulation(20, distribution="cauchy"))), "\n")
}
# Exercise for the reader: Compare the distribution of the median of
# the Normal against the distribution of the median of the Cauchy.
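# A sketch of that comparison (my addition, reusing the same machinery with
# median() in place of mean()) --
one.median <- function(N=100, distribution="normal") {
if (distribution == "normal") {
x <- rnorm(N)
} else {
x <- rcauchy(N)
}
median(x)
}
m1 <- density(replicate(1000, one.median(20)))
m2 <- density(replicate(1000, one.median(20, distribution="cauchy")))
plot(m1$x, m1$y, xlim=range(m1$x, m2$x), type="l", xlab="Estimated value", ylab="")
lines(m2$x, m2$y, col="red")
abline(v=0)
legend(x="topleft", bty="n", lty=c(1,1), col=c("black", "red"),
legend=c("Median of Normal", "Median of Cauchy"))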

@ -0,0 +1,33 @@
# Goals: Lots of times, you need to give an R object to a friend,
# or embed data into an email.
# First I invent a little dataset --
set.seed(101) # To make sure you get the same random numbers as me
# FYI -- runif(10) yields 10 U(0,1) random numbers.
A = data.frame(x1=runif(10), x2=runif(10), x3=runif(10))
# Look at it --
print(A)
# Writing to a binary file that can be transported
save(A, file="/tmp/my_data_file.rda") # You can give this file to a friend
load("/tmp/my_data_file.rda")
# Plan B - you want pure ascii, which can be put into an email --
dput(A)
# This gives you a block of R code. Let me utilise that generated code
# to create a dataset named "B".
B <- structure(list(x1 = c(0.372198376338929, 0.0438248154241592,
0.709684018278494, 0.657690396532416, 0.249855723232031, 0.300054833060130,
0.584866625955328, 0.333467143354937, 0.622011963743716, 0.54582855431363
), x2 = c(0.879795730113983, 0.706874740775675, 0.731972594512627,
0.931634427979589, 0.455120594473556, 0.590319729177281, 0.820436094887555,
0.224118480458856, 0.411666829371825, 0.0386105608195066), x3 = c(0.700711545301601,
0.956837461562827, 0.213352001970634, 0.661061500199139, 0.923318882007152,
0.795719761401415, 0.0712125543504953, 0.389407767681405, 0.406451216200367,
0.659355078125373)), .Names = c("x1", "x2", "x3"), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
# Verify that A and B are near-identical --
A-B
# or,
all.equal(A,B)

@ -0,0 +1,18 @@
# Goal: Display of a macroeconomic time-series, with a filled colour
# bar showing a recession.
years <- 1950:2000
timeseries <- cumsum(c(100, runif(50)*5))
hilo <- range(timeseries)
plot(years, timeseries, type="l", lwd=3)
# A recession from 1960 to 1965 --
polygon(x=c(1960,1960, 1965,1965),
y=c(hilo, rev(hilo)),
density=NA, col="orange", border=NA)
lines(years, timeseries, type="l", lwd=3) # paint again so line comes on top
# alternative method -- though not as good looking --
# library(plotrix)
# gradient.rect(1960, hilo[1], 1965, hilo[2],
# reds=c(0,1), greens=c(0,0), blues=c(0,0),
# gradient="y")

@ -0,0 +1,26 @@
# Goal: Show the efficiency of the mean when compared with the median
# using a large simulation where both estimators are applied on
# a sample of U(0,1) uniformly distributed random numbers.
one.simulation <- function(N=100) { # N defaults to 100 if not supplied
x <- runif(N)
return(c(mean(x), median(x)))
}
# Simulation --
results <- replicate(100000, one.simulation(20)) # Gives back a 2x100000 matrix
# Two kernel densities --
k1 <- density(results[1,]) # results[1,] is the 1st row
k2 <- density(results[2,])
# A pretty picture --
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",
lty=c(1,1),
col=c("black", "red"),
legend=c("Mean", "Median"))

@ -0,0 +1,48 @@
Simple Calculator in R
# Program make a simple calculator
# that can add, subtract, multiply
# and divide using functions
add <- function(x, y) {
return(x + y)
}
subtract <- function(x, y) {
return(x - y)
}
multiply <- function(x, y) {
return(x * y)
}
divide <- function(x, y) {
return(x / y)
}
# take input from the user
print("Select operation.")
print("1.Add")
print("2.Subtract")
print("3.Multiply")
print("4.Divide")
choice = as.integer(readline(prompt="Enter choice[1/2/3/4]: "))
num1 = as.integer(readline(prompt="Enter first number: "))
num2 = as.integer(readline(prompt="Enter second number: "))
operator <- switch(choice,"+","-","*","/")
result <- switch(choice, add(num1, num2), subtract(num1, num2), multiply(num1, num2), divide(num1, num2))
print(paste(num1, operator, num2, "=", result))
Output
[1] "Select operation."
[1] "1.Add"
[1] "2.Subtract"
[1] "3.Multiply"
[1] "4.Divide"
Enter choice[1/2/3/4]: 4
Enter first number: 20
Enter second number: 4
[1] "20 / 4 = 5"

@ -0,0 +1,53 @@
# Goal: Simulation to study size and power in a simple problem.
set.seed(101)
# The data generating process: a simple uniform distribution with stated mean
dgp <- function(N,mu) {runif(N)-0.5+mu}
# Simulate one FIXED hypothesis test for H0:mu=0, given a true mu for a sample size N
one.test <- function(N, truemu) {
x <- dgp(N,truemu)
muhat <- mean(x)
s <- sd(x)/sqrt(N)
# Under the null, the distribution of the mean has standard error s
threshold <- 1.96*s
(muhat < -threshold) || (muhat > threshold)
} # Return of TRUE means reject the null
# Do one experiment, where the fixed H0:mu=0 is run Nexperiments times with a sample size N.
# We return only one number: the fraction of the time that H0 is rejected.
experiment <- function(Nexperiments, N, truemu) {
sum(replicate(Nexperiments, one.test(N, truemu)))/Nexperiments
}
# Measure the size of a test, i.e. rejections when H0 is true
experiment(10000, 50, 0)
# Measurement with sample size of 50, and true mu of 0.
# Power study: I.e. Pr(rejection) when H0 is false
# (one special case in here is when the H0 is actually true)
muvalues <- seq(-.15,.15,.01)
# When true mu < -0.15 and when true mu > 0.15,
# the Pr(rejection) veers to 1 (full power) and it's not interesting.
# First do this with sample size of 50
results <- NULL
for (truth in muvalues) {
results <- c(results, experiment(10000, 50, truth))
}
par(mai=c(.8,.8,.2,.2))
plot(muvalues, results, type="l", lwd=2, ylim=c(0,1),
xlab="True mu", ylab="Pr(Rejection of H0:mu=0)")
abline(h=0.05, lty=2)
# Now repeat this with sample size of 100 (should yield a higher power)
results <- NULL
for (truth in muvalues) {
results <- c(results, experiment(10000, 100, truth))
}
lines(muvalues, results, lwd=2, col="blue")
legend(x=-0.15, y=.2, lwd=c(2,1,2), lty=c(1,2,1), cex=.8,
col=c("black","black","blue"), bty="n",
legend=c("N=50", "Size, 0.05", "N=100"))

@ -0,0 +1,25 @@
Sort a Vector
> x
[1] 7 1 8 3 2 6 5 2 2 4
> # sort in ascending order
> sort(x)
[1] 1 2 2 2 3 4 5 6 7 8
> # sort in descending order
> sort(x, decreasing=TRUE)
[1] 8 7 6 5 4 3 2 2 2 1
> # vector x remains unaffected
> x
[1] 7 1 8 3 2 6 5 2 2 4
Sometimes we want the indices of the sorted vector instead of the values. In such cases, we can use the order() function.
> order(x)
[1] 2 5 8 9 4 10 7 6 1 3
> order(x, decreasing=TRUE)
[1] 3 1 6 7 10 4 5 8 9 2
> x[order(x)] # this will also sort x
[1] 1 2 2 2 3 4 5 6 7 8

@ -0,0 +1,22 @@
# Goal: Special cases in reading files
# Reading in a .bz2 file --
read.table(bzfile("file.text.bz2")) # Requires you have ./file.text.bz2
# Reading in a .gz file --
read.table(gzfile("file.text.gz")) # Requires you have ./file.text.bz2
# Reading from a pipe --
mydata <- read.table(pipe("awk -f filter.awk input.txt"))
# Reading from a URL --
read.table(url("http://www.mayin.org/ajayshah/A/demo.text"))
# This also works --
read.table("http://www.mayin.org/ajayshah/A/demo.text")
# Hmm, I couldn't think of how to read a .bz2 file from a URL. How about:
read.table(pipe("links -source http://www.mayin.org/ajayshah/A/demo.text.bz2 | bunzip2"))
# Reading binary files from a URL --
load(url("http://www.mayin.org/ajayshah/A/nifty_weekly_returns.rda"))

@ -0,0 +1,30 @@
# Goal: Standard computations with well-studied distributions.
# The normal distribution is named "norm". With this, we have:
# Normal density
dnorm(c(-1.96,0,1.96))
# Cumulative normal density
pnorm(c(-1.96,0,1.96))
# Inverse of this
qnorm(c(0.025,.5,.975))
pnorm(qnorm(c(0.025,.5,.975)))
# 1000 random numbers from the normal distribution
summary(rnorm(1000))
# Here's the same ideas, for the chi-squared distribution with 10 degrees
# of freedom.
dchisq(c(0,5,10), df=10)
# Cumulative chi-squared density
pchisq(c(0,5,10), df=10)
# Inverse of this
qchisq(c(0.025,.5,.975), df=10)
# 1000 random numbers from this chi-squared distribution
summary(rchisq(1000, df=10))

@ -0,0 +1,24 @@
# Goal: Some of the standard tests
# A classical setting --
x <- runif(100, 0, 10) # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma is 1
d <- lm(y ~ x)
# CLS results --
summary(d)
library(sandwich)
library(lmtest)
# Durbin-Watson test --
dwtest(d, alternative="two.sided")
# Breusch-Pagan test --
bptest(d)
# Heteroscedasticity and autocorrelation consistent (HAC) tests
coeftest(d, vcov=kernHAC)
# Transplant the HAC values back in --
library(xtable)
sum.d <- summary(d)
xtable(sum.d)
sum.d$coefficients[1:2,1:4] <- coeftest(d, vcov=kernHAC)[1:2,1:4]
xtable(sum.d)

@ -0,0 +1,13 @@
my.name <- readline(prompt="Enter name: ")
my.age <- readline(prompt="Enter age: ")
# convert character into integer
my.age <- as.integer(my.age)
print(paste("Hi,", my.name, "next year you will be", my.age+1, "years old."))
Output
Enter name: Mary
Enter age: 17
[1] "Hi, Mary next year you will be 18 years old."

@ -0,0 +1,116 @@
# Goal: The amazing R vector notation.
cat("EXAMPLE 1: sin(x) for a vector --\n")
# Suppose you have a vector x --
x = c(0.1,0.6,1.0,1.5)
# The bad way --
n = length(x)
r = numeric(n)
for (i in 1:n) {
r[i] = sin(x[i])
}
print(r)
# The good way -- don't use loops --
print(sin(x))
cat("\n\nEXAMPLE 2: Compute the mean of every row of a matrix --\n")
# Here's another example. It isn't really about R; it's about thinking in
# matrix notation. But still.
# Let me setup a matrix --
N=4; M=100;
r = matrix(runif(N*M), N, M)
# So I face an NxM matrix
# [r11 r12 ... r1M]
# [r21 r22 ... r2M]
# [r31 r32 ... r3M]
# My goal: each column needs to be reduced to a mean.
# Method 1 uses loops:
mean1 = numeric(M)
for (i in 1:M) {
mean1[i] = mean(r[,i])
}
# Alternatively, just say:
mean2 = rep(1/N, N) %*% r # Pretty!
# The two answers are the same --
all.equal(mean1,mean2[,])
#
# As an aside, I should say that you can do this directly by using
# the colMeans() function. But the above is more about pedagogy rather
# than showing you how to get column means.
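# For instance (my addition, equivalent to the above) --
mean3 <- colMeans(r)
all.equal(mean1, mean3)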
cat("\n\nEXAMPLE 3: Nelson-Siegel yield curve\n")
# Write this as if you're dealing with scalars --
# Nelson Siegel function
nsz <- function(b0, b1, b2, tau, t) {
tmp = t/tau
tmp2 = exp(-tmp)
return(b0 + ((b1+b2)*(1-tmp2)/(tmp)) - (b2*tmp2))
}
timepoints <- c(0.01,1:5)
# The bad way:
z <- numeric(length(timepoints))
for (i in 1:length(timepoints)) {
z[i] <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints[i])
}
print(z)
# The R way --
print(z <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
cat("\n\nEXAMPLE 3: Making the NPV of a bond--\n")
# You know the bad way - sum over all cashflows, NPVing each.
# Now look at the R way.
C = rep(100, 6)
nsz(14.084,-3.4107,0.0015,1.8832,timepoints) # Print interest rates
C/((1.05)^timepoints) # Print cashflows discounted @ 5%
C/((1 + 0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints))^timepoints) # Using NS instead of 5%
# NPV in two different ways --
C %*% (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints
sum(C * (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints)
# You can drop back to a flat yield curve at 5% easily --
sum(C * 1.05^-timepoints)
# Make a function for NPV --
npv <- function(C, timepoints, r) {
return(sum(C * (1 + (0.01*r))^-timepoints))
}
npv(C, timepoints, 5)
# Bottom line: Here's how you make the NPV of a bond with cashflows C
# at timepoints timepoints when the zero curve is a Nelson-Siegel curve --
npv(C, timepoints, nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
# Wow!
# ---------------------------------------------------------------------------
# Elegant vector notation is amazingly fast (in addition to being beautiful)
N <- 1e5
x <- runif(N, -3,3)
y <- runif(N)
method1 <- function(x,y) {
tmp <- NULL
for (i in 1:N) {
if (x[i] < 0) tmp <- c(tmp, y[i])
}
tmp
}
method2 <- function(x,y) {
y[x < 0]
}
s1 <- system.time(ans1 <- method1(x,y))
s2 <- system.time(ans2 <- method2(x,y))
all.equal(ans1,ans2)
s1/s2 # On my machine it's 2000x faster

@ -0,0 +1,66 @@
# Goal: To do OLS by MLE.
# OLS likelihood function
# Note: I am going to write the LF using sigma2=sigma^2 and not sigma.
ols.lf1 <- function(theta, y, X) {
beta <- theta[-1]
sigma2 <- theta[1]
if (sigma2 <= 0) return(NA)
n <- nrow(X)
e <- y - X%*%beta # t() = matrix transpose
logl <- ((-n/2)*log(2*pi)) - ((n/2)*log(sigma2)) - ((t(e)%*%e)/(2*sigma2))
return(-logl) # since optim() does minimisation by default.
}
# Analytical derivatives
ols.gradient <- function(theta, y, X) {
beta <- theta[-1]
sigma2 <- theta[1]
e <- y - X%*%beta
n <- nrow(X)
g <- numeric(length(theta))
g[1] <- (-n/(2*sigma2)) + (t(e)%*%e)/(2*sigma2*sigma2) # d logl / d sigma2
g[-1] <- (t(X) %*% e)/sigma2 # d logl / d beta
return(-g)
}
X <- cbind(1, runif(1000))
theta.true <- c(2,4,6) # error variance = 2, intercept = 4, slope = 6.
y <- X %*% theta.true[-1] + sqrt(theta.true[1]) * rnorm(1000)
# Estimation by OLS --
d <- summary(lm(y ~ X[,2]))
theta.ols <- c(sigma2 = d$sigma^2, d$coefficients[,1])
cat("OLS theta = ", theta.ols, "\n\n")
cat("\nGradient-free (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)
cat("\nUsing the gradient (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)
cat("\n\nYou say you want a covariance matrix?\n")
p <- optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), hessian=TRUE,
y=y, X=X)
inverted <- solve(p$hessian)
results <- cbind(p$par, sqrt(diag(inverted)), p$par/sqrt(diag(inverted)))
colnames(results) <- c("Coefficient", "Std. Err.", "t")
rownames(results) <- c("Sigma", "Intercept", "X")
cat("MLE results --\n")
print(results)
cat("Compare with the OLS results --\n")
d$coefficients
# Picture of how the loglikelihood changes if you perturb the sigma
theta <- theta.ols
delta.values <- seq(-1.5, 1.5, .01)
logl.values <- as.numeric(lapply(delta.values,
function(x) {-ols.lf1(theta+c(x,0,0),y,X)}))
plot(sqrt(theta[1]+delta.values), logl.values, type="l", lwd=3, col="blue",
xlab="Sigma", ylab="Log likelihood")
grid()

@ -0,0 +1,28 @@
# Goal: To do `moving window volatility' of returns.
library(zoo)
# Some data to play with (Nifty on all fridays for calendar 2004) --
p <- structure(c(1946.05, 1971.9, 1900.65, 1847.55, 1809.75, 1833.65, 1913.6, 1852.65, 1800.3, 1867.7, 1812.2, 1725.1, 1747.5, 1841.1, 1853.55, 1868.95, 1892.45, 1796.1, 1804.45, 1582.4, 1560.2, 1508.75, 1521.1, 1508.45, 1491.2, 1488.5, 1537.5, 1553.2, 1558.8, 1601.6, 1632.3, 1633.4, 1607.2, 1590.35, 1609, 1634.1, 1668.75, 1733.65, 1722.5, 1775.15, 1820.2, 1795, 1779.75, 1786.9, 1852.3, 1872.95, 1872.35, 1901.05, 1996.2, 1969, 2012.1, 2062.7, 2080.5), index = structure(c(12419, 12426, 12433, 12440, 12447, 12454, 12461, 12468, 12475, 12482, 12489, 12496, 12503, 12510, 12517, 12524, 12531, 12538, 12545, 12552, 12559, 12566, 12573, 12580, 12587, 12594, 12601, 12608, 12615, 12622, 12629, 12636, 12643, 12650, 12657, 12664, 12671, 12678, 12685, 12692, 12699, 12706, 12713, 12720, 12727, 12734, 12741, 12748, 12755, 12762, 12769, 12776, 12783), class = "Date"), frequency = 0.142857142857143, class = c("zooreg", "zoo"))
# Shift to returns --
r <- 100*diff(log(p))
head(r)
summary(r)
sd(r)
# Compute the moving window vol --
vol <- sqrt(52) * rollapply(r, 20, sd, align = "right") # sqrt(52) annualises weekly returns
# A pretty plot --
plot(vol, type="l", ylim=c(0,max(vol,na.rm=TRUE)),
lwd=2, col="purple", xlab="2004",
ylab=paste("Annualised sigma, 20-week window"))
grid()
legend(x="bottomleft", col=c("purple", "darkgreen"),
lwd=c(2,2), bty="n", cex=0.8,
legend=c("Annualised 20-week vol (left scale)", "Nifty (right scale)"))
par(new=TRUE)
plot(p, type="l", lwd=2, col="darkgreen",
xaxt="n", yaxt="n", xlab="", ylab="")
axis(4)

@ -0,0 +1,58 @@
# Goal: To do nonlinear regression, in three ways
# By just supplying the function to be fit,
# By also supplying the analytical derivatives, and
# By having him analytically differentiate the function to be fit.
#
# John Fox has a book "An R and S+ companion to applied regression"
# (abbreviated CAR).
# An appendix associated with this book, titled
# "Nonlinear regression and NLS"
# is up on the web, and I strongly recommend that you go read it.
#
# This file is essentially from there (I have made slight changes).
# First take some data - from the CAR book --
library(car)
data(US.pop)
attach(US.pop)
plot(year, population, type="l", col="blue")
# So you see, we have a time-series of the US population. We want to
# fit a nonlinear model to it.
library(stats) # Contains nonlinear regression
time <- 0:20
pop.mod <- nls(population ~ beta1/(1 + exp(beta2 + beta3*time)),
start=list(beta1=350, beta2=4.5, beta3=-0.3), trace=TRUE)
# You just write in the formula that you want to fit, and supply
# starting values. "trace=TRUE" makes him show iterations go by.
summary(pop.mod)
# Add in predicted values into the plot
lines(year, fitted.values(pop.mod), lwd=3, col="red")
# Look at residuals
plot(year, residuals(pop.mod), type="b")
abline(h=0, lty=2)
# Using analytical derivatives:
model <- function(beta1, beta2, beta3, time) {
m <- beta1/(1+exp(beta2+beta3*time))
term <- exp(beta2 + beta3*time)
gradient <- cbind((1+term)^-1,
-beta1*(1+term)^-2 * term,
-beta1*(1+term)^-2 * term * time)
attr(m, 'gradient') <- gradient
return(m)
}
summary(nls(population ~ model(beta1, beta2, beta3, time),
start=list(beta1=350, beta2=4.5, beta3=-0.3)))
# Using analytical derivatives, using automatic differentiation (!!!):
model <- deriv(~ beta1/(1 + exp(beta2+beta3*time)), # rhs of model
c('beta1', 'beta2', 'beta3'), # parameter names
function(beta1, beta2, beta3, time){} # arguments for result
)
summary(nls(population ~ model(beta1, beta2, beta3, time),
start=list(beta1=350, beta2=4.5, beta3=-0.3)))

@ -0,0 +1,18 @@
# Goal: To do sorting.
#
# The approach here needs to be explained. If `i' is a vector of
# integers, then the data frame D[i,] picks up rows from D based
# on the values found in `i'.
#
# The order() function makes an integer vector which is a correct
# ordering for the purpose of sorting.
D <- data.frame(x=c(1,2,3,1), y=c(7,19,2,2))
D
# Sort on x
indexes <- order(D$x)
D[indexes,]
# Print out sorted dataset, sorted in reverse by y
D[rev(order(D$y)),]

@ -0,0 +1,35 @@
# Goal: To make a latex table with results of an OLS regression.
# Get an OLS --
x1 = runif(100)
x2 = runif(100, 0, 2)
y = 2 + 3*x1 + 4*x2 + rnorm(100)
m = lm(y ~ x1 + x2)
# and print it out prettily --
library(xtable)
# Bare --
xtable(m)
xtable(anova(m))
# Better --
print.xtable(xtable(m, caption="My regression",
label="t:mymodel",
digits=c(0,3,2,2,3)),
type="latex",
file="xtable_demo_ols.tex",
table.placement = "tp",
latex.environments=c("center", "footnotesize"))
print.xtable(xtable(anova(m),
caption="ANOVA of my regression",
label="t:anova_mymodel"),
type="latex",
file="xtable_demo_anova.tex",
table.placement = "tp",
latex.environments=c("center", "footnotesize"))
# Read the documentation of xtable. It actually knows how to generate
# pretty latex tables for a lot more R objects than just OLS results.
# It can be a workhorse for making tabular out of matrices, and
# can also generate HTML.
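# For instance (my addition; the output file name is illustrative), the same
# table can be written out as HTML instead of latex --
print(xtable(m, caption="My regression"), type="html", file="xtable_demo_ols.html")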

@ -0,0 +1,33 @@
# Goal: To make a panel of pictures.
par(mfrow=c(3,2)) # 3 rows, 2 columns.
# Now the next 6 pictures will be placed on these 6 regions. :-)
# Let me take some pains on the 1st
plot(density(runif(100)), lwd=2)
text(x=0, y=0.2, "100 uniforms") # Showing you how to place text at will
abline(h=0, v=0)
# All these statements affect the 1st plot.
x=seq(0.01,1,0.01)
par(col="blue") # default colour to blue.
# 2 --
plot(x, sin(x), type="l")
lines(x, cos(x), type="l", col="red")
# 3 --
plot(x, exp(x), type="l", col="green")
lines(x, log(x), type="l", col="orange")
# 4 --
plot(x, tan(x), type="l", lwd=3, col="yellow")
# 5 --
plot(x, exp(-x), lwd=2)
lines(x, exp(x), col="green", lwd=3)
# 6 --
plot(x, sin(x*x), type="l")
lines(x, sin(1/x), col="pink")
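# A small follow-up (not in the original file): mfrow and the blue
# default colour persist for later plots on this device, so reset them
# when the panel is done --
par(mfrow=c(1,1), col="black")
# A more general habit is oldpar <- par(no.readonly=TRUE) before the
# changes, and par(oldpar) afterwards.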

@ -0,0 +1,41 @@
# Goal: To make latex tabular out of an R matrix
# Setup a nice R object:
m <- matrix(rnorm(8), nrow=2)
rownames(m) <- c("Age", "Weight")
colnames(m) <- c("Person1", "Person2", "Person3", "Person4")
m
# Translate it into a latex tabular:
library(xtable)
xtable(m, digits=rep(3,5))
# Production latex code that goes into a paper or a book --
print(xtable(m,
caption="String",
label="t:"),
type="latex",
file="blah.gen",
table.placement="tp",
latex.environments=c("center", "footnotesize"))
# Now you do \input{blah.gen} in your latex file.
# You're lazy, and want to use R to generate latex tables for you?
data <- cbind(
c(7,9,11,2),
c(2,4,19,21)
)
colnames(data) <- c("a","b")
rownames(data) <- c("x","y","z","a")
xtable(data)
# or you could do
data <- rbind(
c(7,2),
c(9,4),
c(11,19),
c(2,21)
)
# and the rest goes through identically.

@ -0,0 +1,24 @@
# Goal: To read in a simple data file where date data is present.
# Suppose you have a file "x.data" which looks like this:
# 1997-07-04,3.1,4
# 1997-07-05,7.2,19
# 1997-07-07,1.7,2
# 1997-07-08,1.1,13
A <- read.table("x.data", sep=",",
col.names=c("date", "my1", "my2"))
A$date <- as.Date(A$date, format="%Y-%m-%d")
# Say ?strptime to learn how to use "%" to specify
# other date formats. Two examples --
# "15/12/2002" needs "%d/%m/%Y"
# "03 Jun 1997" needs "%d %b %Y"
# Actually, if you're using the ISO 8601 date format, i.e.
# "%Y-%m-%d", that's the default setting and you don't need to
# specify the format.
A$newcol <- A$my1 + A$my2 # Makes a new column in A
newvar <- A$my1 - A$my2 # Makes a new R object "newvar"
A$my1 <- NULL # Delete the `my1' column
summary(A) # Makes summary statistics
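# A hedged extra sketch (not in the original file): the same %-codes
# work directly in as.Date() on character strings --
as.Date("15/12/2002", format="%d/%m/%Y")
as.Date("03 Jun 1997", format="%d %b %Y")   # %b assumes an English locale
# Date objects support arithmetic and formatting:
A$date[2] - A$date[1]                       # difference in days
format(A$date[1], "%d %b %Y")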

@ -0,0 +1,26 @@
# Goal: To read in a simple data file, and look around its contents.
# Suppose you have a file "x.data" which looks like this:
# 1997,3.1,4
# 1998,7.2,19
# 1999,1.7,2
# 2000,1.1,13
# To read it in --
A <- read.table("x.data", sep=",",
col.names=c("year", "my1", "my2"))
nrow(A) # Count the rows in A
summary(A$year) # The column "year" in data frame A
# is accessed as A$year
A$newcol <- A$my1 + A$my2 # Makes a new column in A
newvar <- A$my1 - A$my2 # Makes a new R object "newvar"
A$my1 <- NULL # Removes the column "my1"
# You might find these useful, to "look around" a dataset --
str(A)
summary(A)
library(Hmisc) # This requires that you've installed the Hmisc package
contents(A)
describe(A)
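# A hedged companion sketch (not in the original file): two more ways to
# "look around", and writing the modified data frame back out. The
# filename "x_modified.data" is just an example.
head(A)                                   # first few rows
dim(A)                                    # rows and columns
write.csv(A, file="x_modified.data", row.names=FALSE)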

@ -0,0 +1,43 @@
# Goal: To show amazing R indexing notation, and the use of is.na()
x <- c(2,7,9,2,NA,5) # An example vector to play with.
# Give me elems 1 to 3 --
x[1:3]
# Give me all but elem 1 --
x[-1]
# Odd numbered elements --
indexes <- seq(1,6,2)
x[indexes]
# or, more compactly,
x[seq(1,6,2)]
# Access elements by specifying "on" / "off" through booleans --
require <- c(TRUE,TRUE,FALSE,FALSE,FALSE,FALSE)
x[require]
# Short vectors get reused! So, to get odd numbered elems --
x[c(TRUE,FALSE)]
# Locate missing data --
is.na(x)
# Replace missing data by 0 --
x[is.na(x)] <- 0
x
# Similar ideas work for matrices --
y <- matrix(c(2,7,9,2,NA,5), nrow=2)
y
# Make a matrix containing columns 1 and 3 --
y[,c(1,3)]
# Let us see what is.na(y) does --
is.na(y)
str(is.na(y))
# So is.na(y) gives back a matrix with the same structure as y.
# Hence I can say
y[is.na(y)] <- -1
y
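# An extra sketch (not in the original file): the boolean trick extends
# to conditions computed from the data itself --
x[x > 3]            # elements of x greater than 3
which(x > 3)        # their positions
y[y[,1] > 3, ]      # rows of y whose first column exceeds 3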

@ -0,0 +1,42 @@
# Goals: To write functions
# To write functions that send back multiple objects.
# FIRST LEARN ABOUT LISTS --
X = list(height=5.4, weight=54)
print("Use default printing --")
print(X)
print("Accessing individual elements --")
cat("Your height is ", X$height, " and your weight is ", X$weight, "\n")
# FUNCTIONS --
square <- function(x) {
return(x*x)
}
cat("The square of 3 is ", square(3), "\n")
# default value of the arg is set to 5.
cube <- function(x=5) {
return(x*x*x);
}
cat("Calling cube with 2 : ", cube(2), "\n") # will give 2^3
cat("Calling cube : ", cube(), "\n") # will default to 5^3.
# LEARN ABOUT FUNCTIONS THAT RETURN MULTIPLE OBJECTS --
powers <- function(x) {
parcel = list(x2=x*x, x3=x*x*x, x4=x*x*x*x);
return(parcel);
}
X = powers(3);
print("Showing powers of 3 --"); print(X);
# WRITING THIS COMPACTLY (4 lines instead of 7)
powerful <- function(x) {
return(list(x2=x*x, x3=x*x*x, x4=x*x*x*x));
}
print("Showing powers of 3 --"); print(powerful(3));
# In R, the last expression in a function is, by default, what is
# returned. So you could equally just say:
powerful <- function(x) {list(x2=x*x, x3=x*x*x, x4=x*x*x*x)}
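# A small usage sketch (not in the original file): the components of the
# returned list are picked out with $, like any other list --
res <- powerful(3)
cat("3 squared is", res$x2, "and 3 cubed is", res$x3, "\n")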

@ -0,0 +1,44 @@
# Goal: Given two vectors of data,
# superpose their CDFs
#       and show the results of the two-sample Kolmogorov-Smirnov test
# The function consumes two vectors x1 and x2.
# You have to provide a pair of labels as `legendstrings'.
# If you supply an xlab, it's used
# If you specify log - e.g. log="x" - this is passed on to plot.
# The remaining args that you specify are sent on into ks.test()
two.cdfs.plot <- function(x1, x2, legendstrings, xlab="", log="", ...) {
stopifnot(length(x1)>0,
length(x2)>0,
length(legendstrings)==2)
hilo <- range(c(x1,x2))
par(mai=c(.8,.8,.2,.2))
plot(ecdf(x1), xlim=hilo, verticals=TRUE, cex=0,
xlab=xlab, log=log, ylab="Cum. distribution", main="")
grid()
plot(ecdf(x2), add=TRUE, verticals=TRUE, cex=0, lwd=3)
legend(x="bottomright", lwd=c(1,3), lty=1, bty="n",
legend=legendstrings)
k <- ks.test(x1,x2, ...)
text(x=hilo[1], y=c(.9,.85), pos=4, cex=.8,
labels=c(
paste("KS test statistic: ", sprintf("%.3g", k$statistic)),
paste("Prob value: ", sprintf("%.3g", k$p.value))
)
)
k
}
x1 <- rnorm(100, mean=7, sd=1)
x2 <- rnorm(100, mean=9, sd=1)
# Check error detection --
two.cdfs.plot(x1,x2)
# Typical use --
two.cdfs.plot(x1, x2, c("X1","X2"), xlab="Height (metres)", log="x")
# Send args into ks.test() --
two.cdfs.plot(x1, x2, c("X1","X2"), alternative="less")

@ -0,0 +1,61 @@
# Goal: Experiment with fitting nonlinear functional forms in
# OLS, using orthogonal polynomials to avoid difficulties with
# near-singular design matrices that occur with ordinary polynomials.
# Shriya Anand, Gabor Grothendieck, Ajay Shah, March 2006.
# We will deal with noisy data from the d.g.p. y = sin(x) + e
x <- seq(0, 2*pi, length.out=50)
set.seed(101)
y <- sin(x) + 0.3*rnorm(50)
basicplot <- function(x, y, minx=0, maxx=3*pi, title="") {
plot(x, y, xlim=c(minx,maxx), ylim=c(-2,2), main=title)
lines(x, sin(x), col="blue", lty=2, lwd=2)
abline(h=0, v=0)
}
x.outsample <- seq(0, 3*pi, length.out=100)
# Severe multicollinearity with ordinary polynomials
x2 <- x*x
x3 <- x2*x
x4 <- x3*x
cor(cbind(x, x2, x3, x4))
# and a perfect design matrix using orthogonal polynomials
m <- poly(x, 4)
all.equal(cor(m), diag(4), check.attributes=FALSE) # Correlation matrix is I.
par(mfrow=c(2,2))
# Ordinary polynomial regression --
p <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4))
summary(p)
basicplot(x, y, title="Polynomial, insample") # Data
lines(x, fitted(p), col="red", lwd=3) # In-sample
basicplot(x, y, title="Polynomial, out-of-sample")
predictions.p <- predict(p, list(x = x.outsample)) # Out-of-sample
lines(x.outsample, predictions.p, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)
# As expected, polynomial fitting gives terrible results out of sample.
# Now the IDENTICAL steps, using orthogonal polynomials --
d <- lm(y ~ poly(x, 4))
summary(d)
basicplot(x, y, title="Orth. poly., insample") # Data
lines(x, fitted(d), col="red", lwd=3) # In-sample
basicplot(x, y, title="Orth. poly., out-of-sample")
predictions.op <- predict(d, list(x = x.outsample)) # Out-of-sample
lines(x.outsample, predictions.op, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)
# predict(d) is magical! See ?SafePrediction
# The story runs at two levels. First, when you estimate an OLS model,
# predict()ion requires applying the coefficients to an appropriate
# X matrix. But one level deeper, the polynomial (or orthogonal
# polynomial) basis has to be re-evaluated to build that X matrix from
# the supplied x.outsample data.
# If you say p <- poly(x, n)
# then you can say predict(p, new) where predict.poly() gets invoked.
# And when you say predict(lm()), the full steps are worked out for
# you automatically: predict.poly() is used to make an X matrix and
# then prediction based on the regression results is done.
all.equal(predictions.p, predictions.op) # Both paths are identical for this
# (tame) problem.
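# A hedged side-note (not in the original file): poly() can also return
# raw powers with raw=TRUE. The fitted values should agree with the
# orthogonal version up to numerical error, but the columns are again
# strongly correlated --
d.raw <- lm(y ~ poly(x, 4, raw=TRUE))
all.equal(fitted(d.raw), fitted(d))
cor(poly(x, 4, raw=TRUE))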

@ -0,0 +1,26 @@
# Goal: Utilise matrix notation
# We use the problems of portfolio analysis as an example.
# Prices of 4 firms to play with, at weekly frequency (for calendar 2004) --
p <- structure(c(300.403, 294.604, 291.038, 283.805, 270.773, 275.506, 292.271, 292.837, 284.872, 295.037, 280.939, 259.574, 250.608, 268.84, 266.507, 263.94, 273.173, 238.609, 230.677, 192.847, 219.078, 201.846, 210.279, 193.281, 186.748, 197.314, 202.813, 204.08, 226.044, 242.442, 261.274, 269.173, 256.05, 259.75, 243, 250.3, 263.45, 279.5, 289.55, 291.95, 302.1, 284.4, 283.5, 287.8, 298.3, 307.6, 307.65, 311.9, 327.7, 318.1, 333.6, 358.9, 385.1, 53.6, 51.95, 47.65, 44.8, 44.85, 44.3, 47.1, 44.2, 41.8, 41.9, 41, 35.3, 33.35, 35.6, 34.55, 35.55, 40.05, 35, 34.85, 28.95, 31, 29.25, 29.05, 28.95, 24.95, 26.15, 28.35, 29.4, 32.55, 37.2, 39.85, 40.8, 38.2, 40.35, 37.55, 39.4, 39.8, 43.25, 44.75, 47.25, 49.6, 47.6, 46.35, 49.4, 49.5, 50.05, 50.5, 51.85, 56.35, 54.15, 58, 60.7, 62.7, 293.687, 292.746, 283.222, 286.63, 259.774, 259.257, 270.898, 250.625, 242.401, 248.1, 244.942, 239.384, 237.926, 224.886, 243.959, 270.998, 265.557, 257.508, 258.266, 257.574, 251.917, 250.583, 250.783, 246.6, 252.475, 266.625, 263.85, 249.925, 262.9, 264.975, 273.425, 275.575, 267.2, 282.25, 284.25, 290.75, 295.625, 296.25, 291.375, 302.225, 318.95, 324.825, 320.55, 328.75, 344.05, 345.925, 356.5, 368.275, 374.825, 373.525, 378.325, 378.6, 374.4, 1416.7, 1455.15, 1380.97, 1365.31, 1303.2, 1389.64, 1344.05, 1266.29, 1265.61, 1312.17, 1259.25, 1297.3, 1327.38, 1250, 1328.03, 1347.46, 1326.79, 1286.54, 1304.84, 1272.44, 1227.53, 1264.44, 1304.34, 1277.65, 1316.12, 1370.97, 1423.35, 1382.5, 1477.75, 1455.15, 1553.5, 1526.8, 1479.85, 1546.8, 1565.3, 1606.6, 1654.05, 1689.7, 1613.95, 1703.25, 1708.05, 1786.75, 1779.75, 1906.35, 1976.6, 2027.2, 2057.85, 2029.6, 2051.35, 2033.4, 2089.1, 2065.2, 2091.7), .Dim = c(53, 4), .Dimnames = list(NULL, c("TISCO", "SAIL", "Wipro", "Infosys")))
# Shift from prices to returns --
r <- 100*diff(log(p))
# Historical expected returns --
colMeans(r)
# Historical correlation matrix --
cor(r)
# Historical covariance matrix --
S <- cov(r)
S
# Historical portfolio variance for a stated portfolio of 20%,20%,30%,30% --
w <- c(.2, .2, .3, .3)
t(w) %*% S %*% w
# The portfolio optimisation function in tseries --
library(tseries)
optimised <- portfolio.optim(r) # This uses the historical facts from r
optimised$pw # Weights
optimised$pm # Expected return using these weights
optimised$ps # Standard deviation of optimised port.
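# A short follow-up sketch (not in the original file): the risk and
# return implied by the stated 20/20/30/30 weights, for comparison with
# the optimiser's answer --
sqrt(t(w) %*% S %*% w)      # portfolio standard deviation
sum(w * colMeans(r))        # portfolio expected return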

@ -0,0 +1,17 @@
> sum(2,7,5)
[1] 14
> x <- c(2, NA, 3, 1, 4)
> x
[1] 2 NA 3 1 4
> sum(x) # if any element is NA or NaN, result is NA or NaN
[1] NA
> sum(x, na.rm=TRUE) # this way we can ignore NA and NaN values
[1] 10
> mean(x, na.rm=TRUE)
[1] 2.5
> prod(x, na.rm=TRUE)
[1] 24
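# A hedged addendum (not part of the original transcript): another
# common idiom is to drop the missing values explicitly --
> x[!is.na(x)]
[1] 2 3 1 4
> sum(x[!is.na(x)])      # same answer as sum(x, na.rm=TRUE)
[1] 10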
