Adding R programming language

Michael Reber 2019-11-18 14:03:28 +01:00
parent 599b63599b
commit 026079a47d
70 changed files with 2257 additions and 0 deletions

@@ -0,0 +1,31 @@
# Goals: A first look at R objects - vectors, lists, matrices, data frames.
# To make vectors "x" "y" "year" and "names"
x <- c(2,3,7,9)
y <- c(9,7,3,2)
year <- 1990:1993
names <- c("payal", "shraddha", "kritika", "itida")
# Accessing the 1st and last elements of y --
y[1]
y[length(y)]
# To make a list "person" --
person <- list(name="payal", x=2, y=9, year=1990)
person
# Accessing things inside a list --
person$name
person$x
# To make a matrix, pasting together the columns "year" "x" and "y"
# The verb cbind() stands for "column bind"
cbind(year, x, y)
# To make a "data frame", which is a list of vectors of the same length --
D <- data.frame(names, year, x, y)
nrow(D)
# Accessing one of these vectors
D$names
# Accessing the last element of this vector
D$names[nrow(D)]
# Or equally,
D$names[length(D$names)]

@@ -0,0 +1,11 @@
# Goal: A histogram with tails shown in red.
# This happened on the R mailing list on 7 May 2004.
# This is by Martin Maechler <maechler@stat.math.ethz.ch>, who was
# responding to a slightly imperfect version of this by
# "Guazzetti Stefano" <Stefano.Guazzetti@ausl.re.it>
x <- rnorm(1000)
hx <- hist(x, breaks=100, plot=FALSE)
plot(hx, col=ifelse(abs(hx$breaks) < 1.669, 4, 2))
# What is cool is that "col" is supplied a vector.

@@ -0,0 +1,67 @@
# Goals: ARMA modeling - estimation, diagnostics, forecasting.
# 0. SETUP DATA
rawdata <- c(-0.21,-2.28,-2.71,2.26,-1.11,1.71,2.63,-0.45,-0.11,4.79,5.07,-2.24,6.46,3.82,4.29,-1.47,2.69,7.95,4.46,7.28,3.43,-3.19,-3.14,-1.25,-0.50,2.25,2.77,6.72,9.17,3.73,6.72,6.04,10.62,9.89,8.23,5.37,-0.10,1.40,1.60,3.40,3.80,3.60,4.90,9.60,18.20,20.60,15.20,27.00,15.42,13.31,11.22,12.77,12.43,15.83,11.44,12.32,12.10,12.02,14.41,13.54,11.36,12.97,10.00,7.20,8.74,3.92,8.73,2.19,3.85,1.48,2.28,2.98,4.21,3.85,6.52,8.16,5.36,8.58,7.00,10.57,7.12,7.95,7.05,3.84,4.93,4.30,5.44,3.77,4.71,3.18,0.00,5.25,4.27,5.14,3.53,4.54,4.70,7.40,4.80,6.20,7.29,7.30,8.38,3.83,8.07,4.88,8.17,8.25,6.46,5.96,5.88,5.03,4.99,5.87,6.78,7.43,3.61,4.29,2.97,2.35,2.49,1.56,2.65,2.49,2.85,1.89,3.05,2.27,2.91,3.94,2.34,3.14,4.11,4.12,4.53,7.11,6.17,6.25,7.03,4.13,6.15,6.73,6.99,5.86,4.19,6.38,6.68,6.58,5.75,7.51,6.22,8.22,7.45,8.00,8.29,8.05,8.91,6.83,7.33,8.52,8.62,9.80,10.63,7.70,8.91,7.50,5.88,9.82,8.44,10.92,11.67)
# Make a R timeseries out of the rawdata: specify frequency & startdate
gIIP <- ts(rawdata, frequency=12, start=c(1991,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Full data")
grid()
# Based on this, I decide that 4/1995 is the start of the sensible period.
gIIP <- window(gIIP, start=c(1995,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Estimation subset")
grid()
# Descriptive statistics about gIIP
mean(gIIP); sd(gIIP); summary(gIIP);
plot(density(gIIP), col="blue", main="(Unconditional) Density of IIP growth")
acf(gIIP)
# 1. ARMA ESTIMATION
m.ar2 <- arima(gIIP, order = c(2,0,0))
print(m.ar2) # Print it out
# 2. ARMA DIAGNOSTICS
tsdiag(m.ar2) # His pretty picture of diagnostics
## Time series structure in errors
print(Box.test(m.ar2$residuals, lag=12, type="Ljung-Box"));
## Sniff for ARCH
print(Box.test(m.ar2$residuals^2, lag=12, type="Ljung-Box"));
## Eyeball distribution of residuals
plot(density(m.ar2$residuals), col="blue", xlim=c(-8,8),
main=paste("Residuals of AR(2)"))
# 3. FORECASTING
## Make a picture of the residuals
plot.ts(m.ar2$residuals, ylab="Innovations", col="blue", lwd=2)
s <- sqrt(m.ar2$sigma2)
abline(h=c(-s,s), lwd=2, col="lightGray")
p <- predict(m.ar2, n.ahead = 12) # Make 12 predictions.
print(p)
## Watch the forecastability decay from large values towards 0.
## sd(gIIP) is the naive sigma. p$se is the prediction se.
gain <- 100*(1-p$se/sd(gIIP))
plot.ts(gain, main="Gain in forecast s.d.", ylab="Per cent",
col="blue", lwd=2)
## Make a pretty picture that puts it all together
ts.plot(gIIP, p$pred, p$pred-1.96*p$se, p$pred+1.96*p$se,
gpars=list(lty=c(1,1,2,2), lwd=c(2,2,1,1),
ylab="IIP growth (%)", col=c("blue","red", "red", "red")))
grid()
abline(h=mean(gIIP), lty=2, lwd=2, col="lightGray")
legend(x="bottomleft", cex=0.8, bty="n",
lty=c(1,1,2,2), lwd=c(2,1,1,2),
col=c("blue", "red", "red", "lightGray"),
legend=c("IIP", "AR(2) forecasts", "95% C.I.", "Mean IIP growth"))

r/Add Two Vectors.r
@@ -0,0 +1,16 @@
> x
[1] 3 6 8
> y
[1] 2 9 0
> x + y
[1] 5 15 8
> x + 1 # 1 is recycled to (1,1,1)
[1] 4 7 9
> x + c(1,4) # (1,4) is recycled to (1,4,1) but warning issued
[1] 4 10 9
Warning message:
In x + c(1, 4) :
longer object length is not a multiple of shorter object length
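> # (added example) when the shorter length divides the longer one evenly,
> # recycling is silent -- here lengths 6 and 2:
> rep(x, 2) + c(1, 4)
[1]  4 10  9  7  7 12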

@@ -0,0 +1,33 @@
# Goal: All manner of import and export of datasets.
# Invent a dataset --
A <- data.frame(
name=c("a","b","c"),
ownership=c("Case 1","Case 1","Case 2"),
listed.at=c("NSE",NA,"BSE"),
# Firm "b" is unlisted.
is.listed=c(TRUE,FALSE,TRUE),
# R convention - boolean variables are named "is.something"
x=c(2.2,3.3,4.4),
date=as.Date(c("2004-04-04","2005-05-05","2006-06-06"))
)
# To a spreadsheet through a CSV file --
write.table(A,file="demo.csv",sep = ",",col.names = NA,qmethod = "double")
B <- read.table("demo.csv", header = TRUE, sep = ",", row.names = 1)
# To R as a binary file --
save(A, file="demo.rda")
load("demo.rda")
# To StatDataML, an XML format for the transport of statistical data --
library(StatDataML)
writeSDML(A, "/tmp/demo.sdml")
B <- readSDML("/tmp/demo.sdml")
# To Stata --
library(foreign)
write.dta(A, "/tmp/demo.dta")
B <- read.dta("/tmp/demo.dta")
# foreign::write.foreign() also has a pathway to SAS and SPSS.

@@ -0,0 +1,33 @@
# Goal: An example of simulation-based inference.
# This is in the context of testing for time-series dependence in
# stock market returns data.
# The code here implements the idea of Kim, Nelson, Startz (1991).
# We want to use the distribution of real-world returns data, without
# needing assumptions about normality.
# The null is lack of dependence (i.e. an efficient market).
# So repeatedly, the data is permuted, and the sample ACF is computed.
# This gives us the distribution of the ACF under H0: independence, but
# while using the empirical distribution of the returns data.
# Weekly returns on Nifty, 1/1/2002 to 31/12/2003, 104 weeks of data.
r <- c(-0.70031182197603, 0.421690133064168, -1.20098072984689, 0.143402360644984, 3.81836537549516, 3.17055939373247, 0.305580301919228, 1.23853814691852, 0.81584795095706, -1.51865139747764, -2.71223626421522, -0.784836480094242, 1.09180041170998, 0.397649587762761, -4.11309534220923, -0.263912425099111, -0.0410144239805454, 1.75756212770972, -2.3335373897992, -2.19228764624217, -3.64578978183987, 1.92535789661354, 3.45782867883164, -2.15532607229374, -0.448039988298987, 1.50124793565896, -1.45871585874362, -2.13459863369767, -6.2128068251802, -1.94482987066289, 0.751294815735637, 1.78244982829590, 1.61567494389745, 1.53557708728931, -1.53557708728931, -0.322061470004265, -2.28394919698225, 0.70399304137414, -2.93580952607737, 2.38125098034425, 0.0617697039252185, -4.14482733720716, 2.04397528093754, 0.576400673606603, 3.43072725191913, 2.96465382864843, 2.89833358015583, 1.85387040058336, 1.52136515035952, -0.637268376944444, 1.75418926224609, -0.804391905851354, -0.861816058320475, 0.576902488444109, -2.84259880663331, -1.35375536139417, 1.49096529042234, -2.05404881010045, 2.86868849528146, -0.258270670200478, -4.4515881438687, -1.73055019137092, 3.04427015714648, -2.94928202352018, 1.62081315773994, -6.83117945164824, -0.962715713711582, -1.75875847071740, 1.50330330252721, -0.0479705789653728, 3.68968303215933, -0.535807567290103, 3.94034871061182, 3.85787174417738, 0.932185956989873, 4.08598654183674, 2.27343783689715, 1.13958830440017, 2.01737201171230, -1.88131458327554, 1.97596267156648, 2.79857144562001, 2.22470306481695, 2.03212951411427, 4.95626853448883, 3.40400972901396, 3.03840139165246, -1.89863129741417, -3.70832135042951, 4.78478922155396, 4.3973589590097, 4.9667050392987, 2.99775078737081, -4.12349101552438, 3.25638269809945, 2.29683376253966, -2.64772825878214, -0.630835277076258, 4.72528848505451, 1.87368447333380, 3.17543946162564, 4.58174427843208, 3.23625985632168, 2.29777651227296)
# The 1st autocorrelation from the sample:
acf(r, 1, plot=FALSE)$acf[2]
# Obtain 1000 draws from the distribution of the 1st autocorrelation
# under the null of independence:
set.seed(101) # set.seed is a function; the original "set.seed <- 101" never seeded the RNG
simulated <- replicate(1000, acf(r[sample(1:104, replace=FALSE)], 1, plot=FALSE)$acf[2])
# At 95% --
quantile(simulated, probs=c(.025,.975))
# At 99% --
quantile(simulated, probs=c(.005,.995))
# So we can reject the null at 95% but not at 99%.
# A pretty picture.
plot(density(simulated), col="blue")
abline(v=0)
abline(v=quantile(simulated, probs=c(.025,.975)), lwd=2, col="purple")
abline(v=acf(r, 1, plot=FALSE)$acf[2], lty=2, lwd=4, col="yellow")

@@ -0,0 +1,52 @@
# Goal: Associative arrays (as in awk) or hashes (as in perl).
# Or, more generally, adventures in R addressing.
# Here's a plain R vector:
x <- c(2,3,7,9)
# But now I tag every elem with labels:
names(x) <- c("kal","sho","sad","aja")
# Associative array operations:
x["kal"] <- 12
# Pretty printing the entire associative array:
x
# This works for matrices too:
m <- matrix(runif(10), nrow=5)
rownames(m) <- c("violet","indigo","blue","green","yellow")
colnames(m) <- c("Asia","Africa")
# The full matrix --
m
# Or even better --
library(xtable)
xtable(m)
# Now address symbolically --
m[,"Africa"]
m["indigo",]
m["indigo","Africa"]
# The "in" operator, as in awk --
for (colour in c("yellow", "orange", "red")) {
if (colour %in% rownames(m)) {
cat("For Africa and ", colour, " we have ", m[colour, "Africa"], "\n")
} else {
cat("Colour ", colour, " does not exist in the hash.\n")
}
}
# This works for data frames also --
D <- data.frame(m)
D
# Look closely at what happened --
str(D) # The colours are the rownames(D).
# Operations --
D$Africa
D[,"Africa"]
D["yellow",]
# or
subset(D, rownames(D)=="yellow")
colnames(D) <- c("Antarctica","America")
D
D$America

r/Binary to Decimal.r
@@ -0,0 +1,16 @@
Decimal to Binary
# Program to convert a decimal
# number into a binary number
# using a recursive function
convert_to_binary <- function(n) {
if(n > 1) {
convert_to_binary(as.integer(n/2))
}
cat(n %% 2)
}
Output
> convert_to_binary(52)
110100
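(Added note) Base R can invert this conversion: strtoi("110100", base = 2L) returns 52.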

@@ -0,0 +1,25 @@
Check Armstrong number
# take input from the user
num = as.integer(readline(prompt="Enter a number: "))
# initialize sum
sum = 0
# find the sum of the cube of each digit
temp = num
while(temp > 0) {
digit = temp %% 10
sum = sum + (digit ^ 3)
temp = floor(temp / 10)
}
# display the result
if(num == sum) {
print(paste(num, "is an Armstrong number"))
} else {
print(paste(num, "is not an Armstrong number"))
}
Output
Enter a number: 23
[1] "23 is not an Armstrong number"

r/Check Leap Year.r
@@ -0,0 +1,27 @@
Check Leap Year
# Program to check if
# the input year is
# a leap year or not
year = as.integer(readline(prompt="Enter a year: "))
if((year %% 4) == 0) {
if((year %% 100) == 0) {
if((year %% 400) == 0) {
print(paste(year,"is a leap year"))
} else {
print(paste(year,"is not a leap year"))
}
} else {
print(paste(year,"is a leap year"))
}
} else {
print(paste(year,"is not a leap year"))
}
Output 1
Enter a year: 1900
[1] "1900 is not a leap year"
Output 2
Enter a year: 2000
[1] "2000 is a leap year"

@@ -0,0 +1,21 @@
Check Odd and Even Number
# Program to check if
# the input number is odd or even.
# A number is even if division
# by 2 gives a remainder of 0.
# If the remainder is 1, it is odd.
num = as.integer(readline(prompt="Enter a number: "))
if((num %% 2) == 0) {
print(paste(num,"is Even"))
} else {
print(paste(num,"is Odd"))
}
Output 1
Enter a number: 89
[1] "89 is Odd"
Output 2
Enter a number: 0
[1] "0 is Even"

@@ -0,0 +1,24 @@
Check Positive, Negative or Zero
# In this program, we input a number,
# check whether it is positive,
# negative, or zero, and display
# an appropriate message
num = as.double(readline(prompt="Enter a number: "))
if(num > 0) {
print("Positive number")
} else {
if(num == 0) {
print("Zero")
} else {
print("Negative number")
}
}
Output 1
Enter a number: -9.6
[1] "Negative number"
Output 2
Enter a number: 2
[1] "Positive number"

r/Check Prime Number.r
@@ -0,0 +1,27 @@
Check Prime Number
# Program to check if
# the input number is
# prime or not
# take input from the user
num = as.integer(readline(prompt="Enter a number: "))
flag = 0
# prime numbers are greater than 1
if(num > 1) {
# check for factors
flag = 1
for(i in 2:(num-1)) {
if ((num %% i) == 0) {
flag = 0
break
}
}
}
if(num == 2) flag = 1 # 2 is prime, but the loop above wrongly clears its flag
if(flag == 1) {
print(paste(num,"is a prime number"))
} else {
print(paste(num,"is not a prime number"))
}
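# (Added sketch, not part of the original) Trial division only needs to run
# up to sqrt(num), which is much faster for large inputs --
is_prime <- function(num) {
  if(num < 2) return(FALSE)
  if(num %in% c(2, 3)) return(TRUE)
  if(num %% 2 == 0) return(FALSE)
  i <- 3
  while(i * i <= num) {
    if(num %% i == 0) return(FALSE)
    i <- i + 2
  }
  TRUE
}
is_prime(97)   # TRUE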

r/Compute LCM in R.r
@@ -0,0 +1,30 @@
Compute LCM in R
# Program to find the L.C.M. of two input number
lcm <- function(x, y) {
# choose the greater number
if(x > y) {
greater = x
} else {
greater = y
}
while(TRUE) {
if((greater %% x == 0) && (greater %% y == 0)) {
lcm = greater
break
}
greater = greater + 1
}
return(lcm)
}
# take input from the user
num1 = as.integer(readline(prompt = "Enter first number: "))
num2 = as.integer(readline(prompt = "Enter second number: "))
print(paste("The L.C.M. of", num1,"and", num2,"is", lcm(num1, num2)))
Output
Enter first number: 24
Enter second number: 25
[1] "The L.C.M. of 24 and 25 is 600"

@@ -0,0 +1,15 @@
# Goals: Do bootstrap inference, as an example, for a sample median.
library(boot)
samplemedian <- function(x, d) { # d is a vector of integer indexes
return(median(x[d])) # The genius is in the x[d] notation
}
data <- rnorm(50) # Generate a dataset with 50 obs
b <- boot(data, samplemedian, R=2000) # 2000 bootstrap replications
cat("Sample median has a sigma of ", sd(b$t[,1]), "\n")
plot(b)
# Make a 99% confidence interval
boot.ci(b, conf=0.99, type="basic")

r/Doing OLS.r
@@ -0,0 +1,44 @@
# Goal: Simulate a dataset from the OLS model and
# obtain OLS estimates for it.
x <- runif(100, 0, 10) # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma = 1
# You want to just look at OLS results?
summary(lm(y ~ x))
# Suppose x and y were packed together in a data frame --
D <- data.frame(x,y)
summary(lm(y ~ x, D))
# Full and elaborate steps --
d <- lm(y ~ x)
# Learn about this object by saying ?lm and str(d)
# Compact model results --
print(d)
# Pretty graphics for regression diagnostics --
par(mfrow=c(2,2))
plot(d)
d <- summary(d)
# Detailed model results --
print(d)
# Learn about this object by saying ?summary.lm and by saying str(d)
cat("OLS gave slope of ", d$coefficients[2,1],
"and a error sigma of ", d$sigma, "\n")
## I need to drop down to a smaller dataset now --
x <- runif(10)
y <- 2 + 3*x + rnorm(10)
m <- lm(y ~ x)
# Now R supplies a wide range of generic functions which extract
# useful things out of the result of estimation of many kinds of models.
residuals(m)
fitted(m)
AIC(m)
AIC(m, k=log(10)) # SBC
vcov(m)
logLik(m)
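# (Added note) Two more such generics, assuming the model m from above:
confint(m)                              # confidence intervals for coefficients
predict(m, data.frame(x=c(0.2, 0.8)))   # predictions at new x values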

@@ -0,0 +1,55 @@
# Goal: "Dummy variables" in regression.
# Suppose you have this data:
people = data.frame(
age = c(21,62,54,49,52,38),
education = c("college", "school", "none", "school", "college", "none"),
education.code = c( 2, 1, 0, 1, 2, 0 )
)
# Here people$education is a string categorical variable and
# people$education.code is the same thing, with a numerical coding system.
people
# Note the structure of the dataset --
str(people)
# The strings supplied for `education' have been treated (correctly) as
# a factor, but education.code is being treated as an integer and not as
# a factor.
# We want to do a dummy variable regression. Normally you would have:
# 1 Chosen college as the omitted category
# 2 Made a dummy for "none" named educationnone
# 3 Made a dummy for "school" named educationschool
# 4 Ran a regression like lm(age ~ educationnone + educationschool, people)
# But this is R. Things are cool:
lm(age ~ education, people)
# ! :-)
# When you feed him an explanatory variable like education, he does all
# these steps automatically. (He chose college as the omitted category).
# If you use an integer coding, then the obvious thing goes wrong --
lm(age ~ education.code, people)
# because he's thinking that education.code is an integer explanatory
# variable. So you need to:
lm(age ~ factor(education.code), people)
# (he chose a different omitted category)
# Alternatively, fix up the dataset --
people$education.code <- factor(people$education.code)
lm(age ~ education.code, people)
#
# Bottom line:
# Once the dataset has categorical variables correctly represented as
# factors, i.e. as in
str(people)
# doing OLS in R induces automatic generation of dummy variables while leaving one out:
lm(age ~ education, people)
lm(age ~ education.code, people)
# But what if you want the X matrix?
m <- lm(age ~ education, people)
model.matrix(m)
# This is the design matrix that went into the regression m.

@@ -0,0 +1,38 @@
Fibonacci Sequence in R
# Program to display the Fibonacci
# sequence up to n-th term using
# recursive functions
recurse_fibonacci <- function(n) {
if(n <= 1) {
return(n)
} else {
return(recurse_fibonacci(n-1) + recurse_fibonacci(n-2))
}
}
# take input from the user
nterms = as.integer(readline(prompt="How many terms? "))
# check if the number of terms is valid
if(nterms <= 0) {
print("Plese enter a positive integer")
} else {
print("Fibonacci sequence:")
for(i in 0:(nterms-1)) {
print(recurse_fibonacci(i))
}
}
Output
How many terms? 9
[1] "Fibonacci sequence:"
[1] 0
[1] 1
[1] 1
[1] 2
[1] 3
[1] 5
[1] 8
[1] 13
[1] 21

@@ -0,0 +1,14 @@
Find Factorial of a number using recursion
recur_factorial <- function(n) {
if(n <= 1) {
return(1)
} else {
return(n * recur_factorial(n-1))
}
}
Output
> recur_factorial(5)
[1] 120
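(Added note) Base R also ships this: factorial(5) likewise returns 120.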

@@ -0,0 +1,33 @@
Find Minimum and Maximum
> x
[1] 5 8 3 9 2 7 4 6 10
> # find the minimum
> min(x)
[1] 2
> # find the maximum
> max(x)
[1] 10
> # find the range
> range(x)
[1] 2 10
If we want to find where the minimum or maximum is located, i.e. the index instead of the actual value, we can use the which.min() and which.max() functions.
Note that these functions return the index of the first minimum or maximum in case multiple of them exist.
> x
[1] 5 8 3 9 2 7 4 6 10
> # find index of the minimum
> which.min(x)
[1] 5
> # find index of the maximum
> which.max(x)
[1] 9
> # alternate way to find the minimum
> x[which.min(x)]
[1] 2
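> # (added example) which() gives ALL positions in case of ties,
> # not just the first --
> which(x == min(x))
[1] 5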

@@ -0,0 +1,29 @@
Find factors of a number
print_factors <- function(x) {
print(paste("The factors of",x,"are:"))
for(i in 1:x) {
if((x %% i) == 0) {
print(i)
}
}
}
Output
> print_factors(120)
[1] "The factors of 120 are:"
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 8
[1] 10
[1] 12
[1] 15
[1] 20
[1] 24
[1] 30
[1] 40
[1] 60
[1] 120

@@ -0,0 +1,21 @@
Find sum of natural numbers without formula
# take input from the user
num = as.integer(readline(prompt = "Enter a number: "))
if(num < 0) {
print("Enter a positive number")
} else {
sum = 0
# use while loop to iterate until zero
while(num > 0) {
sum = sum + num
num = num - 1
}
print(paste("The sum is", sum))
}
Output
Enter a number: 10
[1] "The sum is 55"

@@ -0,0 +1,20 @@
#Find the factorial of a number
# take input from the user
num = as.integer(readline(prompt="Enter a number: "))
factorial = 1
# check if the number is negative, positive or zero
if(num < 0) {
print("Sorry, factorial does not exist for negative numbers")
} else if(num == 0) {
print("The factorial of 0 is 1")
} else {
for(i in 1:num) {
factorial = factorial * i
}
print(paste("The factorial of", num ,"is",factorial))
}
Output
Enter a number: 8
[1] "The factorial of 8 is 40320"

@@ -0,0 +1,8 @@
> runif(1) # generates 1 random number
[1] 0.3984754
> runif(3) # generates 3 random numbers
[1] 0.8090284 0.1797232 0.6803607
> runif(3, min=5, max=10) # define the range between 5 and 10
[1] 7.099781 8.355461 5.173133
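> # (added note) runif() gives different numbers on every call; use
> # set.seed() first when you need reproducible draws --
> set.seed(42)
> runif(1)   # now everyone running this gets the same value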

r/Handling missing data.r
@@ -0,0 +1,22 @@
# Goal:
# A stock is traded on 2 exchanges.
# Price data is missing at random on both exchanges owing to non-trading.
# We want to make a single price time-series utilising information
# from both exchanges. I.e., missing data for exchange 1 will
# be replaced by information for exchange 2 (if observed).
# Let's create some example data for the problem.
e1 <- runif(15) # Prices on exchange 1
e2 <- e1 + 0.05*rnorm(15) # Prices on exchange 2.
cbind(e1, e2)
# Blow away 5 points from each at random.
e1[sample(1:15, 5)] <- NA
e2[sample(1:15, 5)] <- NA
cbind(e1, e2)
# Now how do we reconstruct a time-series that tries to utilise both?
combined <- e1 # Do use the more liquid exchange here.
missing <- is.na(combined)
combined[missing] <- e2[missing] # if it's also missing, I don't care.
cbind(e1, e2, combined)
# There you are.

@@ -0,0 +1,43 @@
# Goal: Joint distributions, marginal distributions, useful tables.
# First let me invent some fake data
set.seed(102) # This yields a good illustration.
x <- sample(1:3, 15, replace=TRUE)
education <- factor(x, labels=c("None", "School", "College"))
x <- sample(1:2, 15, replace=TRUE)
gender <- factor(x, labels=c("Male", "Female"))
age <- runif(15, min=20,max=60)
D <- data.frame(age, gender, education)
rm(x,age,gender,education)
print(D)
# Table about education
table(D$education)
# Table about education and gender --
table(D$gender, D$education)
# Joint distribution of education and gender --
table(D$gender, D$education)/nrow(D)
# Add in the marginal distributions also
addmargins(table(D$gender, D$education))
addmargins(table(D$gender, D$education))/nrow(D)
# Generate a good LaTeX table out of it --
library(xtable)
xtable(addmargins(table(D$gender, D$education))/nrow(D),
digits=c(0,2,2,2,2)) # You have to do | and \hline manually.
# Study age by gender --
by(D$age, D$gender, mean)
by(D$age, D$gender, sd)
by(D$age, D$gender, summary)
# Two-way table showing average age depending on education & gender
a <- matrix(by(D$age, list(D$gender, D$education), mean), nrow=2)
rownames(a) <- levels(D$gender)
colnames(a) <- levels(D$education)
print(a)
# or, of course,
print(xtable(a))

@@ -0,0 +1,23 @@
# Goals: Simulate a dataset from a "fixed effects" model, and
# obtain "least squares dummy variable" (LSDV) estimates.
#
# We do this in the context of a familiar "earnings function" -
# log earnings is quadratic in log experience, with parallel shifts by
# education category.
# Create an education factor with 4 levels --
education <- factor(sample(1:4,1000, replace=TRUE),
labels=c("none", "school", "college", "beyond"))
# Simulate an experience variable with a plausible range --
experience <- 30*runif(1000) # experience from 0 to 30 years
# Make the intercept vary by education category between 4 given values --
intercept <- c(0.5,1,1.5,2)[education]
# Simulate the log earnings --
log.earnings <- intercept +
2*experience - 0.05*experience*experience + rnorm(1000)
A <- data.frame(education, experience, e2=experience*experience, log.earnings)
summary(A)
# The OLS path to LSDV --
summary(lm(log.earnings ~ -1 + education + experience + e2, A))

@@ -0,0 +1,31 @@
# Goal: Make a time-series object using the "zoo" package
A <- data.frame(date=c("1995-01-01", "1995-01-02", "1995-01-03", "1995-01-06"),
x=runif(4),
y=runif(4))
A$date <- as.Date(A$date) # yyyy-mm-dd is the default format
# So far there's nothing new - it's just a data frame. I have hand-
# constructed A but you could equally have obtained it using read.table().
# I want to make a zoo matrix out of the numerical columns of A
library(zoo)
B <- A
B$date <- NULL
z <- zoo(as.matrix(B), order.by=A$date)
rm(A, B)
# So now you are holding "z", a "zoo" object. You can do many cool
# things with it.
# See http://www.google.com/search?hl=en&q=zoo+quickref+achim&btnI=I%27m+Feeling+Lucky
# To drop down to a plain data matrix, say
C <- coredata(z)
rownames(C) <- as.character(time(z))
# Compare --
str(C)
str(z)
# The above is a tedious way of doing these things, designed to give you
# an insight into what is going on. If you just want to read a file
# into a zoo object, a very short path is something like:
# z <- read.zoo(filename, format="%d %b %Y")

@@ -0,0 +1,20 @@
# Goal: Make pictures in PDF files that can be put into a paper.
xpts <- seq(-3,3,.05)
# Here is my suggested setup for a two-column picture --
pdf("demo2.pdf", width=5.6, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off() # close the device so demo2.pdf is actually written
# My suggested setup for a square one-column picture --
pdf("demo1.pdf", width=2.8, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off() # close the device so demo1.pdf is actually written

r/Multiplication Table.r
@@ -0,0 +1,25 @@
Multiplication Table
# Program to find the multiplication
# table (from 1 to 10)
# of a number input by the user
# take input from the user
num = as.integer(readline(prompt = "Enter a number: "))
# use for loop to iterate 10 times
for(i in 1:10) {
print(paste(num,'x', i, '=', num*i))
}
Output
Enter a number: 7
[1] "7 x 1 = 7"
[1] "7 x 2 = 14"
[1] "7 x 3 = 21"
[1] "7 x 4 = 28"
[1] "7 x 5 = 35"
[1] "7 x 6 = 42"
[1] "7 x 7 = 49"
[1] "7 x 8 = 56"
[1] "7 x 9 = 63"
[1] "7 x 10 = 70"

@@ -0,0 +1,13 @@
# Goal: Display two series on one plot, one with a left y axis
# and another with a right y axis.
y1 <- cumsum(rnorm(100))
y2 <- cumsum(rnorm(100, mean=0.2))
par(mai=c(.8, .8, .2, .8))
plot(1:100, y1, type="l", col="blue", xlab="X axis label", ylab="Left legend")
par(new=TRUE)
plot(1:100, y2, type="l", ann=FALSE, yaxt="n")
axis(4)
legend(x="topleft", bty="n", lty=c(1,1), col=c("blue","black"),
legend=c("String 1 (left scale)", "String 2 (right scale)"))

r/Prices and returns.r
@@ -0,0 +1,99 @@
# Goal: Prices and returns
# I like to multiply returns by 100 so as to have "units in percent".
# In other words, I like it for 5% to be a value like 5 rather than 0.05.
###################################################################
# I. Simulate random-walk prices, switch between prices & returns.
###################################################################
# Simulate a time-series of PRICES drawn from a random walk
# where one-period returns are i.i.d. N(mu, sigma^2).
ranrw <- function(mu, sigma, p0=100, T=100) {
cumprod(c(p0, 1 + (rnorm(n=T, mean=mu, sd=sigma)/100)))
}
prices2returns <- function(x) {
100*diff(log(x))
}
returns2prices <- function(r, p0=100) {
c(p0, p0 * exp(cumsum(r/100)))
}
cat("Simulate 25 points from a random walk starting at 1500 --\n")
p <- ranrw(0.05, 1.4, p0=1500, T=25)
# gives you a 26-long series (the start price plus 25 returns), starting
# at a price of 1500, where one-period returns are N(0.05,1.4^2) percent.
print(p)
cat("Convert to returns--\n")
r <- prices2returns(p)
print(r)
cat("Go back from returns to prices --\n")
goback <- returns2prices(r, 1500)
print(goback)
###################################################################
# II. Plenty of powerful things you can do with returns....
###################################################################
summary(r); sd(r) # summary statistics
plot(density(r)) # kernel density plot
acf(r) # Autocorrelation function
ar(r) # Estimate an AIC-minimising AR model
Box.test(r, lag=2, type="Ljung") # Box-Ljung test
library(tseries)
runs.test(factor(sign(r))) # Runs test
bds.test(r) # BDS test.
###################################################################
# III. Visualisation and the random walk
###################################################################
# I want to obtain intuition into what kinds of price series can happen,
# given a starting price, a mean return, and a given standard deviation.
# This function simulates out 10000 days of a price time-series at a time,
# and waits for you to click in the graph window, after which a second
# series is painted, and so on. Make the graph window very big and
# sit back and admire.
# The point is to eyeball many series and thus obtain some intuition
# into what the random walk does.
visualisation <- function(p0, s, mu, labelstring) {
N <- 10000
x <- (1:(N+1))/250 # Unit of years
while (1) {
plot(x, ranrw(mu, s, p0, N), ylab="Level", log="y",
type="l", col="red", xlab="Time (years)",
main=paste("40 years of a process much like", labelstring))
grid()
z=locator(1)
}
}
# Nifty -- assuming sigma of 1.4% a day and E(returns) of 13% a year
visualisation(2600, 1.4, 13/250, "Nifty")
# The numerical values here are used to think about what the INR/USD
# exchange rate would have looked like if it started from 31.37, had
# a mean depreciation of 5% per year, and had the daily vol of a floating
# exchange rate like EUR/USD.
visualisation(31.37, 0.7, 5/365, "INR/USD (NOT!) with daily sigma=0.7")
# This is of course not like the INR/USD series in the real world -
# which is neither a random walk nor does it have a vol of 0.7% a day.
# The numerical values here are used to think about what the USD/EUR
# exchange rate would look like, starting at 1, having no drift, and having
# the observed daily vol of 0.7%. (This is about right.)
visualisation(1, 0.7, 0, "USD/EUR with no drift")
###################################################################
# IV. A monte carlo experiment about the runs test
###################################################################
# Measure the effectiveness of the runs test when faced with an
# AR(1) process of length 100 with a coeff of 0.1
set.seed(101)
one.ts <- function() {arima.sim(list(order = c(1,0,0), ar = 0.1), n=100)}
table(replicate(1000, runs.test(factor(sign(one.ts())))$p.value < 0.05))
# We find that the runs test throws up a prob value of below 0.05
# for 91 out of 1000 experiments.
# Wow! :-)
# To understand this, you need to look up the man pages of:
# set.seed, arima.sim, sign, factor, runs.test, replicate, table.
# e.g. say ?replicate

@@ -0,0 +1,41 @@
Print Fibonacci Sequence
# take input from the user
nterms = as.integer(readline(prompt="How many terms? "))
# first two terms
n1 = 0
n2 = 1
count = 2
# check if the number of terms is valid
if(nterms <= 0) {
print("Plese enter a positive integer")
} else {
if(nterms == 1) {
print("Fibonacci sequence:")
print(n1)
} else {
print("Fibonacci sequence:")
print(n1)
print(n2)
while(count < nterms) {
nth = n1 + n2
print(nth)
# update values
n1 = n2
n2 = nth
count = count + 1
}
}
}
Output
How many terms? 7
[1] "Fibonacci sequence:"
[1] 0
[1] 1
[1] 1
[1] 2
[1] 3
[1] 5
[1] 8

r/Program to Find GCD.r
@@ -0,0 +1,30 @@
Program to Find GCD
# Program to find the
# H.C.F of two input number
# define a function
hcf <- function(x, y) {
# choose the smaller number
if(x > y) {
smaller = y
} else {
smaller = x
}
for(i in 1:smaller) {
if((x %% i == 0) && (y %% i == 0)) {
hcf = i
}
}
return(hcf)
}
# take input from the user
num1 = as.integer(readline(prompt = "Enter first number: "))
num2 = as.integer(readline(prompt = "Enter second number: "))
print(paste("The H.C.F. of", num1,"and", num2,"is", hcf(num1, num2)))
Output
Enter first number: 72
Enter second number: 120
[1] "The H.C.F. of 72 and 120 is 24"

@@ -0,0 +1,52 @@
# Get the data in place --
load(file="demo.rda")
summary(firms)
# Look at it --
plot(density(log(firms$mktcap)))
plot(firms$mktcap, firms$spread, type="p", cex=.2, col="blue", log="xy",
xlab="Market cap (Mln USD)", ylab="Bid/offer spread (bps)")
m=lm(log(spread) ~ log(mktcap), firms)
summary(m)
# Making deciles --
library(gtools)
library(gdata)
# for deciles (default=quartiles)
size.category = quantcut(firms$mktcap, q=seq(0, 1, 0.1), labels=F)
table(size.category)
means = aggregate(firms, list(size.category), mean)
print(data.frame(means$mktcap,means$spread))
# Make a picture combining the sample mean of spread (in each decile)
# with the weighted average sample mean of the spread (in each decile),
# where weights are proportional to size.
wtd.means = by(firms, size.category,
function(piece) (sum(piece$mktcap*piece$spread)/sum(piece$mktcap)))
lines(means$mktcap, means$spread, type="b", lwd=2, col="green", pch=19)
lines(means$mktcap, wtd.means, type="b", lwd=2, col="red", pch=19)
legend(x=0.25, y=0.5, bty="n",
col=c("blue", "green", "red"),
lty=c(0, 1, 1), lwd=c(0,2,2),
pch=c(0,19,19),
legend=c("firm", "Mean spread in size deciles",
"Size weighted mean spread in size deciles"))
# Within group standard deviations --
aggregate(firms, list(size.category), sd)
# Now I do quartiles by BOTH mktcap and spread.
size.quartiles = quantcut(firms$mktcap, labels=F)
spread.quartiles = quantcut(firms$spread, labels=F)
table(size.quartiles, spread.quartiles)
# Re-express everything as joint probabilities
table(size.quartiles, spread.quartiles)/nrow(firms)
# Compute cell means at every point in the joint table:
aggregate(firms, list(size.quartiles, spread.quartiles), mean)
# Make pretty two-way tables
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, nobs)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, sd)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, sd)

r/R Hello World Program.r
@@ -0,0 +1,11 @@
> # We can use the print() function
> print("Hello World!")
[1] "Hello World!"
> # Quotes can be suppressed in the output
> print("Hello World!", quote = FALSE)
[1] Hello World!
> # If there is more than one item, we can concatenate using paste()
> print(paste("How","are","you?"))
[1] "How are you?"

@@ -0,0 +1,12 @@
# Goal: R syntax where model specification is an argument to a function.
# Invent a dataset
x <- runif(100); y <- runif(100); z <- 2 + 3*x + 4*y + rnorm(100)
D <- data.frame(x=x, y=y, z=z)
amodel <- function(modelstring) {
summary(lm(modelstring, D))
}
amodel(z ~ x)
amodel(z ~ y)
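# (Added note) If the specification arrives as a character string rather than
# a formula, coerce it explicitly: summary(lm(as.formula("z ~ x"), D))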

r/README.md
@@ -0,0 +1,3 @@
# R programming language
R is a programming language designed for statistical computing and graphics. This directory contains code that can be executed within the R software environment.

@@ -0,0 +1,43 @@
# Goal: Reading and writing ascii files, reading and writing binary files.
# And, to measure how much faster it is working with binary files.
# First manufacture a tall data frame:
# FYI -- runif(10) yields 10 U(0,1) random numbers.
B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
summary(B)
# Write out ascii file:
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
# Read in this resulting ascii file:
C=read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names=1)
# Write a binary file out of dataset C:
save(C, file="/tmp/foo.binary")
# Delete the dataset C:
rm(C)
# Restore from foo.binary:
load("/tmp/foo.binary")
summary(C) # should yield the same results
# as summary(B) above.
# Now we time all these operations --
cat("Time creation of dataset:\n")
system.time({
B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
})
cat("Time writing an ascii file out of dataset B:\n")
system.time(
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
)
cat("Time reading an ascii file into dataset C:\n")
system.time(
{C=read.table("/tmp/foo.csv", header = TRUE, sep=",", row.names=1)
})
cat("Time writing a binary file out of dataset C:\n")
system.time(save(C, file="/tmp/foo.binary"))
cat("Time reading a binary file + variablenames from /tmp/foo.binary:\n")
system.time(load("/tmp/foo.binary")) # and then read it in from binary file
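# (Added note) saveRDS()/readRDS() are a single-object alternative to
# save()/load():
# saveRDS(C, "/tmp/foo.rds"); C <- readRDS("/tmp/foo.rds")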

@@ -0,0 +1,15 @@
# Goal: Reading in a Microsoft .xls file directly
library(gdata)
a <- read.xls("file.xls", sheet=2) # This reads in the 2nd sheet
# Look at what the cat dragged in
str(a)
# If you have a date column, you'll want to fix it up like this:
a$date <- as.Date(as.character(a$X), format="%d-%b-%y")
a$X <- NULL
# Also see http://tolstoy.newcastle.edu.au/R/help/06/04/25674.html for
# another path.

@@ -0,0 +1,17 @@
# Goal: To read in files produced by CMIE's "Business Beacon".
# This assumes you have made a file of MONTHLY data using CMIE's
# Business Beacon program. This contains 2 columns: M3 and M0.
A <- read.table(
# Generic to all BB files --
sep="|", # CMIE's .txt file is pipe delimited
skip=3, # Skip the 1st 3 lines
na.strings=c("N.A.","Err"), # The ways they encode missing data
# Specific to your immediate situation --
file="bb_data.text",
col.names=c("junk", "date", "M3", "M0")
)
A$junk <- NULL # Blow away this column
# Parse the CMIE-style "Mmm yy" date string that's used on monthly data
A$date <- as.Date(paste("1", as.character(A$date)), format="%d %b %Y")

@@ -0,0 +1,16 @@
> # sample with replacement
> sample(x, replace = TRUE)
[1] 15 17 13 9 5 15 11 15 1
> # if we simply pass in a positive number n, it will sample
> # from 1:n without replacement
> sample(10)
[1] 2 4 7 9 1 3 10 5 8 6
An example that simulates 10 coin tosses.
> sample(c("H","T"),10, replace = TRUE)
[1] "H" "H" "H" "T" "H" "T" "H" "H" "H" "T"

@@ -0,0 +1,40 @@
# Goals: Scare the hell out of children with the Cauchy distribution.
# A function which simulates N draws from one of two distributions,
# and returns the mean obtained thusly.
one.simulation <- function(N=100, distribution="normal") {
if (distribution == "normal") {
x <- rnorm(N)
} else {
x <- rcauchy(N)
}
mean(x)
}
k1 <- density(replicate(1000, one.simulation(20)))
k2 <- density(replicate(1000, one.simulation(20, distribution="cauchy")))
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=0) # the true location of both distributions
legend(x="topleft", bty="n",
lty=c(1,1),
col=c("black", "red"),
legend=c("Mean of Normal", "Mean of Cauchy"))
# The distribution of the mean of normals collapses into a point;
# that of the cauchy does not.
# Here's more scary stuff --
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of normal - ",
sd(replicate(1000, one.simulation(20))), "\n")
}
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of cauchy - ",
sd(replicate(1000, one.simulation(20, distribution="cauchy"))), "\n")
}
# Exercise for the reader: Compare the distribution of the median of
# the Normal against the distribution of the median of the Cauchy.

@@ -0,0 +1,33 @@
# Goals: Lots of times, you need to give an R object to a friend,
# or embed data into an email.
# First I invent a little dataset --
set.seed(101) # To make sure you get the same random numbers as me
# FYI -- runif(10) yields 10 U(0,1) random numbers.
A = data.frame(x1=runif(10), x2=runif(10), x3=runif(10))
# Look at it --
print(A)
# Writing to a binary file that can be transported
save(A, file="/tmp/my_data_file.rda") # You can give this file to a friend
load("/tmp/my_data_file.rda")
# Plan B - you want pure ascii, which can be put into an email --
dput(A)
# This gives you a block of R code. Let me utilise that generated code
# to create a dataset named "B".
B <- structure(list(x1 = c(0.372198376338929, 0.0438248154241592,