Adding R programming language
This commit is contained in:
parent
599b63599b
commit
026079a47d
@ 0,0 +1,31 @@


# Goals: A first look at R objects  vectors, lists, matrices, data frames.




# To make vectors "x" "y" "year" and "names"


x < c(2,3,7,9)


y < c(9,7,3,2)


year < 1990:1993


names < c("payal", "shraddha", "kritika", "itida")


# Accessing the 1st and last elements of y 


y[1]


y[length(y)]




# To make a list "person" 


person < list(name="payal", x=2, y=9, year=1990)


person


# Accessing things inside a list 


person$name


person$x




# To make a matrix, pasting together the columns "year" "x" and "y"


# The verb cbind() stands for "column bind"


cbind(year, x, y)




# To make a "data frame", which is a list of vectors of the same length 


D < data.frame(names, year, x, y)


nrow(D)


# Accessing one of these vectors


D$names


# Accessing the last element of this vector


D$names[nrow(D)]


# Or equally,


D$names[length(D$names)]

11
r/A histogram with tails shown in red.r
Normal file
11
r/A histogram with tails shown in red.r
Normal file
@ 0,0 +1,11 @@


# Goal: A histogram with tails shown in red.




# This happened on the R mailing list on 7 May 2004.


# This is by Martin Maechler <maechler@stat.math.ethz.ch>, who was


# responding to a slightly imperfect version of this by


# "Guazzetti Stefano" <Stefano.Guazzetti@ausl.re.it>




x < rnorm(1000)


hx < hist(x, breaks=100, plot=FALSE)


plot(hx, col=ifelse(abs(hx$breaks) < 1.669, 4, 2))


# What is cool is that "col" is supplied a vector.

67
r/ARMA modeling  estimation, diagnostics, forecasting.r
Normal file
67
r/ARMA modeling  estimation, diagnostics, forecasting.r
Normal file
@ 0,0 +1,67 @@


# Goals: ARMA modeling  estimation, diagnostics, forecasting.






# 0. SETUP DATA


rawdata < c(0.21,2.28,2.71,2.26,1.11,1.71,2.63,0.45,0.11,4.79,5.07,2.24,6.46,3.82,4.29,1.47,2.69,7.95,4.46,7.28,3.43,3.19,3.14,1.25,0.50,2.25,2.77,6.72,9.17,3.73,6.72,6.04,10.62,9.89,8.23,5.37,0.10,1.40,1.60,3.40,3.80,3.60,4.90,9.60,18.20,20.60,15.20,27.00,15.42,13.31,11.22,12.77,12.43,15.83,11.44,12.32,12.10,12.02,14.41,13.54,11.36,12.97,10.00,7.20,8.74,3.92,8.73,2.19,3.85,1.48,2.28,2.98,4.21,3.85,6.52,8.16,5.36,8.58,7.00,10.57,7.12,7.95,7.05,3.84,4.93,4.30,5.44,3.77,4.71,3.18,0.00,5.25,4.27,5.14,3.53,4.54,4.70,7.40,4.80,6.20,7.29,7.30,8.38,3.83,8.07,4.88,8.17,8.25,6.46,5.96,5.88,5.03,4.99,5.87,6.78,7.43,3.61,4.29,2.97,2.35,2.49,1.56,2.65,2.49,2.85,1.89,3.05,2.27,2.91,3.94,2.34,3.14,4.11,4.12,4.53,7.11,6.17,6.25,7.03,4.13,6.15,6.73,6.99,5.86,4.19,6.38,6.68,6.58,5.75,7.51,6.22,8.22,7.45,8.00,8.29,8.05,8.91,6.83,7.33,8.52,8.62,9.80,10.63,7.70,8.91,7.50,5.88,9.82,8.44,10.92,11.67)




# Make a R timeseries out of the rawdata: specify frequency & startdate


gIIP < ts(rawdata, frequency=12, start=c(1991,4))


print(gIIP)


plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,


main="Full data")


grid()




# Based on this, I decide that 4/1995 is the start of the sensible period.


gIIP < window(gIIP, start=c(1995,4))


print(gIIP)


plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,


main="Estimation subset")


grid()




# Descriptive statistics about gIIP


mean(gIIP); sd(gIIP); summary(gIIP);


plot(density(gIIP), col="blue", main="(Unconditional) Density of IIP growth")


acf(gIIP)






# 1. ARMA ESTIMATION


m.ar2 < arima(gIIP, order = c(2,0,0))


print(m.ar2) # Print it out






# 2. ARMA DIAGNOSTICS


tsdiag(m.ar2) # His pretty picture of diagnostics


## Time series structure in errors


print(Box.test(m.ar2$residuals, lag=12, type="LjungBox"));


## Sniff for ARCH


print(Box.test(m.ar2$residuals^2, lag=12, type="LjungBox"));


## Eyeball distribution of residuals


plot(density(m.ar2$residuals), col="blue", xlim=c(8,8),


main=paste("Residuals of AR(2)"))






# 3. FORECASTING


## Make a picture of the residuals


plot.ts(m.ar2$residual, ylab="Innovations", col="blue", lwd=2)


s < sqrt(m.ar2$sigma2)


abline(h=c(s,s), lwd=2, col="lightGray")




p < predict(m.ar2, n.ahead = 12) # Make 12 predictions.


print(p)




## Watch the forecastability decay away from fat values to 0.


## sd(x) is the naive sigma. p$se is the prediction se.


gain < 100*(1p$se/sd(gIIP))


plot.ts(gain, main="Gain in forecast s.d.", ylab="Per cent",


col="blue", lwd=2)




## Make a pretty picture that puts it all together


ts.plot(gIIP, p$pred, p$pred1.96*p$se, p$pred+1.96*p$se,


gpars=list(lty=c(1,1,2,2), lwd=c(2,2,1,1),


ylab="IIP growth (%)", col=c("blue","red", "red", "red")))


grid()


abline(h=mean(gIIP), lty=2, lwd=2, col="lightGray")


legend(x="bottomleft", cex=0.8, bty="n",


lty=c(1,1,2,2), lwd=c(2,1,1,2),


col=c("blue", "red", "red", "lightGray"),


legend=c("IIP", "AR(2) forecasts", "95% C.I.", "Mean IIP growth"))

16
r/Add Two Vectors.r
Normal file
16
r/Add Two Vectors.r
Normal file
@ 0,0 +1,16 @@


> x


[1] 3 6 8


> y


[1] 2 9 0




> x + y


[1] 5 15 8




> x + 1 # 1 is recycled to (1,1,1)


[1] 4 7 9




> x + c(1,4) # (1,4) is recycled to (1,4,1) but warning issued


[1] 4 10 9


Warning message:


In x + c(1, 4) :


longer object length is not a multiple of shorter object length

33
r/All manner of import and export of datasets.r
Normal file
33
r/All manner of import and export of datasets.r
Normal file
@ 0,0 +1,33 @@


# Goal: All manner of import and export of datasets.




# Invent a dataset 


A < data.frame(


name=c("a","b","c"),


ownership=c("Case 1","Case 1","Case 2"),


listed.at=c("NSE",NA,"BSE"),


# Firm "b" is unlisted.


is.listed=c(TRUE,FALSE,TRUE),


# R convention  boolean variables are named "is.something"


x=c(2.2,3.3,4.4),


date=as.Date(c("20040404","20050505","20060606"))


)




# To a spreadsheet through a CSV file 


write.table(A,file="demo.csv",sep = ",",col.names = NA,qmethod = "double")


B < read.table("demo.csv", header = TRUE, sep = ",", row.names = 1)




# To R as a binary file 


save(A, file="demo.rda")


load("demo.rda")




# To the Open XML standard for transport for statistical data 


library(StatDataML)


writeSDML(A, "/tmp/demo.sdml")


B < readSDML("/tmp/demo.sdml")




# To Stata 


library(foreign)


write.dta(A, "/tmp/demo.dta")


B < read.dta("/tmp/demo.dta")




# foreign::write.foreign() also has a pathway to SAS and SPSS.

33
r/An example of simulationbased inference.r
Normal file
33
r/An example of simulationbased inference.r
Normal file
@ 0,0 +1,33 @@


# Goal: An example of simulationbased inference.


# This is in the context of testing for timeseries dependence in


# stock market returns data.


# The code here does the idea of Kim, Nelson, Startz (1991).


# We want to use the distribution of realworld returns data, without


# needing assumptions about normality.


# The null is lack of dependence (i.e. an efficient market).


# So repeatedly, the data is permuted, and the sample ACF is computed.


# This gives us the distribution of the ACF under H0: independence, but


# while using the empirical distribution of the returns data.




# Weekly returns on Nifty, 1/1/2002 to 31/12/2003, 104 weeks of data.


r < c(0.70031182197603, 0.421690133064168, 1.20098072984689, 0.143402360644984, 3.81836537549516, 3.17055939373247, 0.305580301919228, 1.23853814691852, 0.81584795095706, 1.51865139747764, 2.71223626421522, 0.784836480094242, 1.09180041170998, 0.397649587762761, 4.11309534220923, 0.263912425099111, 0.0410144239805454, 1.75756212770972, 2.3335373897992, 2.19228764624217, 3.64578978183987, 1.92535789661354, 3.45782867883164, 2.15532607229374, 0.448039988298987, 1.50124793565896, 1.45871585874362, 2.13459863369767, 6.2128068251802, 1.94482987066289, 0.751294815735637, 1.78244982829590, 1.61567494389745, 1.53557708728931, 1.53557708728931, 0.322061470004265, 2.28394919698225, 0.70399304137414, 2.93580952607737, 2.38125098034425, 0.0617697039252185, 4.14482733720716, 2.04397528093754, 0.576400673606603, 3.43072725191913, 2.96465382864843, 2.89833358015583, 1.85387040058336, 1.52136515035952, 0.637268376944444, 1.75418926224609, 0.804391905851354, 0.861816058320475, 0.576902488444109, 2.84259880663331, 1.35375536139417, 1.49096529042234, 2.05404881010045, 2.86868849528146, 0.258270670200478, 4.4515881438687, 1.73055019137092, 3.04427015714648, 2.94928202352018, 1.62081315773994, 6.83117945164824, 0.962715713711582, 1.75875847071740, 1.50330330252721, 0.0479705789653728, 3.68968303215933, 0.535807567290103, 3.94034871061182, 3.85787174417738, 0.932185956989873, 4.08598654183674, 2.27343783689715, 1.13958830440017, 2.01737201171230, 1.88131458327554, 1.97596267156648, 2.79857144562001, 2.22470306481695, 2.03212951411427, 4.95626853448883, 3.40400972901396, 3.03840139165246, 1.89863129741417, 3.70832135042951, 4.78478922155396, 4.3973589590097, 4.9667050392987, 2.99775078737081, 4.12349101552438, 3.25638269809945, 2.29683376253966, 2.64772825878214, 0.630835277076258, 4.72528848505451, 1.87368447333380, 3.17543946162564, 4.58174427843208, 3.23625985632168, 2.29777651227296)




# The 1st autocorrelation from the sample:


acf(r, 1, plot=FALSE)$acf[2]




# Obtain 1000 draws from the distribution of the 1st autocorrelation


# under the null of independence:


set.seed < 101


simulated < replicate(1000, acf(r[sample(1:104, replace=FALSE)], 1, plot=FALSE)$acf[2])


# At 95% 


quantile(simulated, probs=c(.025,.975))


# At 99% 


quantile(simulated, probs=c(.005,.995))




# So we can reject the null at 95% but not at 99%.




# A pretty picture.


plot(density(simulated), col="blue")


abline(v=0)


abline(v=quantile(simulated, probs=c(.025,.975)), lwd=2, col="purple")


abline(v=acf(r, 1, plot=FALSE)$acf[2], lty=2, lwd=4, col="yellow")

52
r/Associative arrays (as in awk) or hashes (as in perl).r
Normal file
52
r/Associative arrays (as in awk) or hashes (as in perl).r
Normal file
@ 0,0 +1,52 @@


# Goal: Associative arrays (as in awk) or hashes (as in perl).


# Or, more generally, adventures in R addressing.




# Here's a plain R vector:


x < c(2,3,7,9)


# But now I tag every elem with labels:


names(x) < c("kal","sho","sad","aja")


# Associative array operations:


x["kal"] < 12


# Pretty printing the entire associative array:


x




# This works for matrices too:


m < matrix(runif(10), nrow=5)


rownames(m) < c("violet","indigo","blue","green","yellow")


colnames(m) < c("Asia","Africa")


# The full matrix 


m


# Or even better 


library(xtable)


xtable(m)




# Now address symbolically 


m[,"Africa"]


m["indigo",]


m["indigo","Africa"]




# The "in" operator, as in awk 


for (colour in c("yellow", "orange", "red")) {


if (colour %in% rownames(m)) {


cat("For Africa and ", colour, " we have ", m[colour, "Africa"], "\n")


} else {


cat("Colour ", colour, " does not exist in the hash.\n")


}


}




# This works for data frames also 


D < data.frame(m)


D


# Look closely at what happened 


str(D) # The colours are the rownames(D).




# Operations 


D$Africa


D[,"Africa"]


D["yellow",]


# or


subset(D, rownames(D)=="yellow")




colnames(D) < c("Antarctica","America")


D


D$America

16
r/Binary to Decimal.r
Normal file
16
r/Binary to Decimal.r
Normal file
@ 0,0 +1,16 @@


Binary to Decimal


# Program to convert decimal


# number into binary number


# using recursive function




convert_to_binary < function(n) {




if(n > 1) {


convert_to_binary(as.integer(n/2))


}


cat(n %% 2)


}


Output




> convert_to_binary(52)


110100

25
r/Check Armstrong number.r
Normal file
25
r/Check Armstrong number.r
Normal file
@ 0,0 +1,25 @@


Check Armstrong number


# take input from the user


num = as.integer(readline(prompt="Enter a number: "))




# initialize sum


sum = 0




# find the sum of the cube of each digit


temp = num


while(temp > 0) {


digit = temp %% 10


sum = sum + (digit ^ 3)


temp = floor(temp / 10)


}




# display the result


if(num == sum) {


print(paste(num, "is an Armstrong number"))


} else {


print(paste(num, "is not an Armstrong number"))


}


Output 1




Enter a number: 23


[1] "23 is not an Armstrong number"

27
r/Check Leap Year.r
Normal file
27
r/Check Leap Year.r
Normal file
@ 0,0 +1,27 @@


Check Leap Year


# Program to check if


# the input year is


# a leap year or not




year = as.integer(readline(prompt="Enter a year: "))


if((year %% 4) == 0) {


if((year %% 100) == 0) {


if((year %% 400) == 0) {


print(paste(year,"is a leap year"))


} else {


print(paste(year,"is not a leap year"))


}


} else {


print(paste(year,"is a leap year"))


}


} else {


print(paste(year,"is not a leap year"))


}


Output 1




Enter a year: 1900


[1] "1900 is not a leap year"


Output 2




Enter a year: 2000


[1] "2000 is a leap year"

21
r/Check Odd and Even Number.r
Normal file
21
r/Check Odd and Even Number.r
Normal file
@ 0,0 +1,21 @@


Check Odd and Even Number


# Program to check if


# the input number is odd or even.


# A number is even if division


# by 2 give a remainder of 0.


# If remainder is 1, it is odd.




num = as.integer(readline(prompt="Enter a number: "))


if((num %% 2) == 0) {


print(paste(num,"is Even"))


} else {


print(paste(num,"is Odd"))


}


Output 1




Enter a number: 89


[1] "89 is Odd"


Output 2




Enter a number: 0


[1] "0 is Even"

24
r/Check Positive, Negative or Zero.r
Normal file
24
r/Check Positive, Negative or Zero.r
Normal file
@ 0,0 +1,24 @@


Check Positive, Negative or Zero


# In this program, we input a number


# check if the number is positive or


# negative or zero and display


# an appropriate message




num = as.double(readline(prompt="Enter a number: "))


if(num > 0) {


print("Positive number")


} else {


if(num == 0) {


print("Zero")


} else {


print("Negative number")


}


}


Output 1




Enter a number: 9.6


[1] "Negative number"


Output 2




Enter a number: 2


[1] "Positive number"

27
r/Check Prime Number.r
Normal file
27
r/Check Prime Number.r
Normal file
@ 0,0 +1,27 @@


Check Prime Number


# Program to check if


# the input number is


# prime or not




# take input from the user


num = as.integer(readline(prompt="Enter a number: "))




flag = 0


# prime numbers are greater than 1


if(num > 1) {


# check for factors


flag = 1


for(i in 2:(num1)) {


if ((num %% i) == 0) {


flag = 0


break


}


}


}




if(num == 2) flag = 1


if(flag == 1) {


print(paste(num,"is a prime number"))


} else {


print(paste(num,"is not a prime number"))


}

30
r/Compute LCM in R.r
Normal file
30
r/Compute LCM in R.r
Normal file
@ 0,0 +1,30 @@


Compute LCM in R


# Program to find the L.C.M. of two input number


lcm < function(x, y) {


# choose the greater number


if(x > y) {


greater = x


} else {


greater = y


}




while(TRUE) {


if((greater %% x == 0) && (greater %% y == 0)) {


lcm = greater


break


}


greater = greater + 1


}


return(lcm)


}




# take input from the user


num1 = as.integer(readline(prompt = "Enter first number: "))


num2 = as.integer(readline(prompt = "Enter second number: "))




print(paste("The L.C.M. of", num1,"and", num2,"is", lcm(num1, num2)))


Output




Enter first number: 24


Enter second number: 25


[1] "The L.C.M. of 24 and 25 is 600"

@ 0,0 +1,15 @@


# Goals: Do bootstrap inference, as an example, for a sample median.




library(boot)




samplemedian < function(x, d) { # d is a vector of integer indexes


return(median(x[d])) # The genius is in the x[d] notation


}




data < rnorm(50) # Generate a dataset with 50 obs


b < boot(data, samplemedian, R=2000) # 2000 bootstrap replications


cat("Sample median has a sigma of ", sd(b$t[,1]), "\n")


plot(b)




# Make a 99% confidence interval


boot.ci(b, conf=0.99, type="basic")

44
r/Doing OLS.r
Normal file
44
r/Doing OLS.r
Normal file
@ 0,0 +1,44 @@


# Goal: Simulate a dataset from the OLS model and obtain


# obtain OLS estimates for it.




x < runif(100, 0, 10) # 100 draws from U(0,10)


y < 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma = 1




# You want to just look at OLS results?


summary(lm(y ~ x))




# Suppose x and y were packed together in a data frame 


D < data.frame(x,y)


summary(lm(y ~ x, D))




# Full and elaborate steps 


d < lm(y ~ x)


# Learn about this object by saying ?lm and str(d)


# Compact model results 


print(d)


# Pretty graphics for regression diagnostics 


par(mfrow=c(2,2))


plot(d)




d < summary(d)


# Detailed model results 


print(d)


# Learn about this object by saying ?summary.lm and by saying str(d)


cat("OLS gave slope of ", d$coefficients[2,1],


"and a error sigma of ", d$sigma, "\n")






## I need to drop down to a smaller dataset now 


x < runif(10)


y < 2 + 3*x + rnorm(10)


m < lm(y ~ x)




# Now R supplies a wide range of generic functions which extract


# useful things out of the result of estimation of many kinds of models.




residuals(m)


fitted(m)


AIC(m)


AIC(m, k=log(10)) # SBC


vcov(m)


logLik(m)

55
r/Dummy variables in regression.r
Normal file
55
r/Dummy variables in regression.r
Normal file
@ 0,0 +1,55 @@


# Goal: "Dummy variables" in regression.




# Suppose you have this data:


people = data.frame(


age = c(21,62,54,49,52,38),


education = c("college", "school", "none", "school", "college", "none"),


education.code = c( 2, 1, 0, 1, 2, 0 )


)


# Here people$education is a string categorical variable and


# people$education.code is the same thing, with a numerical coding system.


people




# Note the structure of the dataset 


str(people)


# The strings supplied for `education' have been treated (correctly) as


# a factor, but education.code is being treated as an integer and not as


# a factor.






# We want to do a dummy variable regression. Normally you would have:


# 1 Chosen college as the omitted category


# 2 Made a dummy for "none" named educationnone


# 3 Made a dummy for "school" named educationschool


# 4 Ran a regression like lm(age ~ educationnone + educationschool, people)


# But this is R. Things are cool:


lm(age ~ education, people)




# ! :)


# When you feed him an explanatory variable like education, he does all


# these steps automatically. (He chose college as the omitted category).




# If you use an integer coding, then the obvious thing goes wrong 


lm(age ~ education.code, people)


# because he's thinking that education.code is an integer explanatory


# variable. So you need to:




lm(age ~ factor(education.code), people)


# (he choose a different omitted category)




# Alternatively, fix up the dataset 


people$education.code < factor(people$education.code)


lm(age ~ education.code, people)




#


# Bottom line:


# Once the dataset has categorical variables correctly represented as factors, i.e. as


str(people)


# doing OLS in R induces automatic generation of dummy variables while leaving one out:


lm(age ~ education, people)


lm(age ~ education.code, people)




# But what if you want the X matrix?


m < lm(age ~ education, people)


model.matrix(m)


# This is the design matrix that went into the regression m.

38
r/Fibonacci Sequence in R.r
Normal file
38
r/Fibonacci Sequence in R.r
Normal file
@ 0,0 +1,38 @@


Fibonacci Sequence in R


# Program to diplay the Fibonacci


# sequence up to nth term using


# recursive functions




recurse_fibonacci < function(n) {


if(n <= 1) {


return(n)


} else {


return(recurse_fibonacci(n1) + recurse_fibonacci(n2))


}


}




# take input from the user


nterms = as.integer(readline(prompt="How many terms? "))




# check if the number of terms is valid


if(nterms <= 0) {


print("Plese enter a positive integer")


} else {


print("Fibonacci sequence:")


for(i in 0:(nterms1)) {


print(recurse_fibonacci(i))


}


}


Output




How many terms? 9


[1] "Fibonacci sequence:"


[1] 0


[1] 1


[1] 1


[1] 2


[1] 3


[1] 5


[1] 8


[1] 13


[1] 21

14
r/Find Factorial of a number using recursion.r
Normal file
14
r/Find Factorial of a number using recursion.r
Normal file
@ 0,0 +1,14 @@


Find Factorial of a number using recursion


recur_factorial < function(n) {


if(n <= 1) {


return(1)


} else {


return(n * recur_factorial(n1))


}


}






Output




> recur_factorial(5)


[1] 120

33
r/Find Minimum and Maximum.r
Normal file
33
r/Find Minimum and Maximum.r
Normal file
@ 0,0 +1,33 @@


Find Minimum and Maximum


> x


[1] 5 8 3 9 2 7 4 6 10




> # find the minimum


> min(x)


[1] 2




> # find the maximum


> max(x)


[1] 10




> # find the range


> range(x)


[1] 2 10


If we want to find where the minimum or maximum is located, i.e. the index instead of the actual value, then we can use which.min() and which.max() functions.




Note that these functions will return the index of the first minimum or maximum in case multiple of them exists.




> x


[1] 5 8 3 9 2 7 4 6 10




> # find index of the minimum


> which.min(x)


[1] 5




> # find index of the minimum


> which.max(x)


[1] 9




> # alternate way to find the minimum


> x[which.min(x)]


[1] 2

29
r/Find factors of a number.r
Normal file
29
r/Find factors of a number.r
Normal file
@ 0,0 +1,29 @@


Find factors of a number


print_factors < function(x) {


print(paste("The factors of",x,"are:"))


for(i in 1:x) {


if((x %% i) == 0) {


print(i)


}


}


}


Output




> print_factors(120)


[1] "The factors of 120 are:"


[1] 1


[1] 2


[1] 3


[1] 4


[1] 5


[1] 6


[1] 8


[1] 10


[1] 12


[1] 15


[1] 20


[1] 24


[1] 30


[1] 40


[1] 60


[1] 120

21
r/Find sum of natural numbers without formula.r
Normal file
21
r/Find sum of natural numbers without formula.r
Normal file
@ 0,0 +1,21 @@


Find sum of natural numbers without formula


# take input from the user


num = as.integer(readline(prompt = "Enter a number: "))




if(num < 0) {


print("Enter a positive number")


} else {


sum = 0




# use while loop to iterate until zero


while(num > 0) {


sum = sum + num


num = num  1


}




print(paste("The sum is", sum))


}


Output




Enter a number: 10


[1] "The sum is 55"

20
r/Find the factorial of a number.r
Normal file
20
r/Find the factorial of a number.r
Normal file
@ 0,0 +1,20 @@


#Find the factorial of a number


# take input from the user


num = as.integer(readline(prompt="Enter a number: "))


factorial = 1




# check is the number is negative, positive or zero


if(num < 0) {


print("Sorry, factorial does not exist for negative numbers")


} else if(num == 0) {


print("The factorial of 0 is 1")


} else {


for(i in 1:num) {


factorial = factorial * i


}


print(paste("The factorial of", num ,"is",factorial))


}


Output




Enter a number: 8


[1] "The factorial of 8 is 40320"

8
r/From Uniform Distribution.r
Normal file
8
r/From Uniform Distribution.r
Normal file
@ 0,0 +1,8 @@


> runif(1) # generates 1 random number


[1] 0.3984754




> runif(3) # generates 3 random number


[1] 0.8090284 0.1797232 0.6803607




> runif(3, min=5, max=10) # define the range between 5 and 10


[1] 7.099781 8.355461 5.173133

22
r/Handling missing data.r
Normal file
22
r/Handling missing data.r
Normal file
@ 0,0 +1,22 @@


# Goal:


# A stock is traded on 2 exchanges.


# Price data is missing at random on both exchanges owing to nontrading.


# We want to make a single price timeseries utilising information


# from both exchanges. I.e., missing data for exchange 1 will


# be replaced by information for exchange 2 (if observed).




# Let's create some example data for the problem.


e1 < runif(15) # Prices on exchange 1


e2 < e1 + 0.05*rnorm(15) # Prices on exchange 2.


cbind(e1, e2)


# Blow away 5 points from each at random.


e1[sample(1:15, 5)] < NA


e2[sample(1:15, 5)] < NA


cbind(e1, e2)




# Now how do we reconstruct a timeseries that tries to utilise both?


combined < e1 # Do use the more liquid exchange here.


missing < is.na(combined)


combined[missing] < e2[missing] # if it's also missing, I don't care.


cbind(e1, e2, combined)


# There you are.

@ 0,0 +1,43 @@


# Goal: Joint distributions, marginal distributions, useful tables.




# First let me invent some fake data


set.seed(102) # This yields a good illustration.


x < sample(1:3, 15, replace=TRUE)


education < factor(x, labels=c("None", "School", "College"))


x < sample(1:2, 15, replace=TRUE)


gender < factor(x, labels=c("Male", "Female"))


age < runif(15, min=20,max=60)




D < data.frame(age, gender, education)


rm(x,age,gender,education)


print(D)




# Table about education


table(D$education)




# Table about education and gender 


table(D$gender, D$education)


# Joint distribution of education and gender 


table(D$gender, D$education)/nrow(D)




# Add in the marginal distributions also


addmargins(table(D$gender, D$education))


addmargins(table(D$gender, D$education))/nrow(D)




# Generate a good LaTeX table out of it 


library(xtable)


xtable(addmargins(table(D$gender, D$education))/nrow(D),


digits=c(0,2,2,2,2)) # You have to do  and \hline manually.




# Study age by education category


by(D$age, D$gender, mean)


by(D$age, D$gender, sd)


by(D$age, D$gender, summary)




# Twoway table showing average age depending on education & gender


a < matrix(by(D$age, list(D$gender, D$education), mean), nrow=2)


rownames(a) < levels(D$gender)


colnames(a) < levels(D$education)


print(a)


# or, of course,


print(xtable(a))

@ 0,0 +1,23 @@


# Goals: Simulate a dataset from a "fixed effects" model, and


# obtain "least squares dummy variable" (LSDV) estimates.


#


# We do this in the context of a familiar "earnings function" 


# log earnings is quadratic in log experience, with parallel shifts by


# education category.




# Create an education factor with 4 levels 


education < factor(sample(1:4,1000, replace=TRUE),


labels=c("none", "school", "college", "beyond"))


# Simulate an experience variable with a plausible range 


experience < 30*runif(1000) # experience from 0 to 20 years


# Make the intercept vary by education category between 4 given values 


intercept < c(0.5,1,1.5,2)[education]




# Simulate the log earnings 


log.earnings < intercept +


2*experience  0.05*experience*experience + rnorm(1000)


A < data.frame(education, experience, e2=experience*experience, log.earnings)


summary(A)




# The OLS path to LSDV 


summary(lm(log.earnings ~ 1 + education + experience + e2, A))

31
r/Make a timeseries object using the zoo package.r
Normal file
31
r/Make a timeseries object using the zoo package.r
Normal file
@ 0,0 +1,31 @@


# Goal: Make a timeseries object using the "zoo" package




A < data.frame(date=c("19950101", "19950102", "19950103", "19950106"),


x=runif(4),


y=runif(4))


A$date < as.Date(A$date) # yyyymmdd is the default format


# So far there's nothing new  it's just a data frame. I have hand


# constructed A but you could equally have obtained it using read.table().




# I want to make a zoo matrix out of the numerical columns of A


library(zoo)


B < A


B$date < NULL


z < zoo(as.matrix(B), order.by=A$date)


rm(A, B)




# So now you are holding "z", a "zoo" object. You can do many cool


# things with it.


# See http://www.google.com/search?hl=en&q=zoo+quickref+achim&btnI=I%27m+Feeling+Lucky




# To drop down to a plain data matrix, say


C < coredata(z)


rownames(C) < as.character(time(z))


# Compare 


str(C)


str(z)




# The above is a tedious way of doing these things, designed to give you


# an insight into what is going on. If you just want to read a file


# into a zoo object, a very short path is something like:


# z < read.zoo(filename, format="%d %b %Y")

20
r/Make pictures in PDF files that can be put into a paper.r
Normal file
20
r/Make pictures in PDF files that can be put into a paper.r
Normal file
@ 0,0 +1,20 @@


# Goal: Make pictures in PDF files that can be put into a paper.




xpts < seq(3,3,.05)




# Here is my suggested setup for a twocolumn picture 


pdf("demo2.pdf", width=5.6, height=2.8, bg="cadetblue1", pointsize=8)


par(mai=c(.6,.6,.2,.2))


plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",


xlab="x", ylab="sin(x*x)")


grid(col="white", lty=1, lwd=.2)


abline(h=0, v=0)




# My suggested setup for a square onecolumn picture 


pdf("demo1.pdf", width=2.8, height=2.8, bg="cadetblue1", pointsize=8)


par(mai=c(.6,.6,.2,.2))


plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",


xlab="x", ylab="sin(x*x)")


grid(col="white", lty=1, lwd=.2)


abline(h=0, v=0)



25
r/Multiplication Table.r
Normal file
25
r/Multiplication Table.r
Normal file
@ 0,0 +1,25 @@


Multiplication Table


# Program to find the multiplication


# table (from 1 to 10)


# of a number input by the user




# take input from the user


num = as.integer(readline(prompt = "Enter a number: "))




# use for loop to iterate 10 times


for(i in 1:10) {


print(paste(num,'x', i, '=', num*i))


}


Output




Enter a number: 7


[1] "7 x 1 = 7"


[1] "7 x 2 = 14"


[1] "7 x 3 = 21"


[1] "7 x 4 = 28"


[1] "7 x 5 = 35"


[1] "7 x 6 = 42"


[1] "7 x 7 = 49"


[1] "7 x 8 = 56"


[1] "7 x 9 = 63"


[1] "7 x 10 = 70"

@ 0,0 +1,13 @@


# Goal: Display two series on one plot, one with a left y axis


# and another with a right y axis.




y1 < cumsum(rnorm(100))


y2 < cumsum(rnorm(100, mean=0.2))




par(mai=c(.8, .8, .2, .8))


plot(1:100, y1, type="l", col="blue", xlab="X axis label", ylab="Left legend")


par(new=TRUE)


plot(1:100, y2, type="l", ann=FALSE, yaxt="n")


axis(4)


legend(x="topleft", bty="n", lty=c(1,1), col=c("blue","black"),


legend=c("String 1 (left scale)", "String 2 (right scale)"))

99
r/Prices and returns.r
Normal file
99
r/Prices and returns.r
Normal file
@ 0,0 +1,99 @@


# Goal: Prices and returns




# I like to multiply returns by 100 so as to have "units in percent".


# In other words, I like it for 5% to be a value like 5 rather than 0.05.




###################################################################


# I. Simulate randomwalk prices, switch between prices & returns.


###################################################################


# Simulate a timeseries of PRICES drawn from a random walk


# where oneperiod returns are i.i.d. N(mu, sigma^2).


ranrw < function(mu, sigma, p0=100, T=100) {


cumprod(c(p0, 1 + (rnorm(n=T, mean=mu, sd=sigma)/100)))


}


prices2returns < function(x) {


100*diff(log(x))


}


returns2prices < function(r, p0=100) {


c(p0, p0 * exp(cumsum(r/100)))


}




cat("Simulate 25 points from a random walk starting at 1500 \n")


p < ranrw(0.05, 1.4, p0=1500, T=25)


# gives you a 25long series, starting with a price of 1500, where


# oneperiod returns are N(0.05,1.4^2) percent.


print(p)




cat("Convert to returns\n")


r < prices2returns(p)


print(r)




cat("Go back from returns to prices \n")


goback < returns2prices(r, 1500)


print(goback)




###################################################################


# II. Plenty of powerful things you can do with returns....


###################################################################


summary(r); sd(r) # summary statistics


plot(density(r)) # kernel density plot


acf(r) # Autocorrelation function


ar(r) # Estimate a AICminimising AR model


Box.test(r, lag=2, type="Ljung") # BoxLjung test


library(tseries)


runs.test(factor(sign(r))) # Runs test


bds.test(r) # BDS test.




###################################################################


# III. Visualisation and the random walk


###################################################################


# I want to obtain intuition into what kinds of price series can happen,


# given a starting price, a mean return, and a given standard deviation.


# This function simulates out 10000 days of a price timeseries at a time,


# and waits for you to click in the graph window, after which a second


# series is painted, and so on. Make the graph window very big and


# sit back and admire.


# The point is to eyeball many series and thus obtain some intuition


# into what the random walk does.


visualisation < function(p0, s, mu, labelstring) {


N < 10000


x < (1:(N+1))/250 # Unit of years


while (1) {


plot(x, ranrw(mu, s, p0, N), ylab="Level", log="y",


type="l", col="red", xlab="Time (years)",


main=paste("40 years of a process much like", labelstring))


grid()


z=locator(1)


}


}




# Nifty  assuming sigma of 1.4% a day and E(returns) of 13% a year


visualisation(2600, 1.4, 13/250, "Nifty")




# The numerical values here are used to think about what the INR/USD


# exchange rate would have looked like if it started from 31.37, had


# a mean depreciation of 5% per year, and had the daily vol of a floating


# exchange rate like EUR/USD.


visualisation(31.37, 0.7, 5/365, "INR/USD (NOT!) with daily sigma=0.7")


# This is of course not like the INR/USD series in the real world 


# which is neither a random walk nor does it have a vol of 0.7% a day.




# The numerical values here are used to think about what the USD/EUR


# exchange rate, starting with 1, having no drift, and having the observed


# daily vol of 0.7. (This is about right).


visualisation(1, 0.7, 0, "USD/EUR with no drift")




###################################################################


# IV. A monte carlo experiment about the runs test


###################################################################


# Measure the effectiveness of the runs test when faced with an


# AR(1) process of length 100 with a coeff of 0.1


set.seed(101)


one.ts < function() {arima.sim(list(order = c(1,0,0), ar = 0.1), n=100)}


table(replicate(1000, runs.test(factor(sign(one.ts())))$p.value < 0.05))


# We find that the runs test throws up a prob value of below 0.05


# for 91 out of 1000 experiments.


# Wow! :)


# To understand this, you need to look up the man pages of:


# set.seed, arima.sim, sign, factor, runs.test, replicate, table.


# e.g. say ?replicate

41
r/Print Fibonacci Sequence.r
Normal file
41
r/Print Fibonacci Sequence.r
Normal file
@ 0,0 +1,41 @@


Print Fibonacci Sequence


# take input from the user


nterms = as.integer(readline(prompt="How many terms? "))




# first two terms


n1 = 0


n2 = 1


count = 2




# check if the number of terms is valid


if(nterms <= 0) {


print("Plese enter a positive integer")


} else {


if(nterms == 1) {


print("Fibonacci sequence:")


print(n1)


} else {


print("Fibonacci sequence:")


print(n1)


print(n2)


while(count < nterms) {


nth = n1 + n2


print(nth)


# update values


n1 = n2


n2 = nth


count = count + 1


}


}


}


Output




How many terms? 7


[1] "Fibonacci sequence:"


[1] 0


[1] 1


[1] 1


[1] 2


[1] 3


[1] 5


[1] 8

30
r/Program to Find GCD.r
Normal file
30
r/Program to Find GCD.r
Normal file
@ 0,0 +1,30 @@


Program to Find GCD


# Program to find the


# H.C.F of two input number




# define a function


hcf < function(x, y) {


# choose the smaller number


if(x > y) {


smaller = y


} else {


smaller = x


}


for(i in 1:smaller) {


if((x %% i == 0) && (y %% i == 0)) {


hcf = i


}


}


return(hcf)


}




# take input from the user


num1 = as.integer(readline(prompt = "Enter first number: "))


num2 = as.integer(readline(prompt = "Enter second number: "))




print(paste("The H.C.F. of", num1,"and", num2,"is", hcf(num1, num2)))


Output




Enter first number: 72


Enter second number: 120


[1] "The H.C.F. of 72 and 120 is 24"

52
r/Quartiles deciles tables graphs.r
Normal file
52
r/Quartiles deciles tables graphs.r
Normal file
@ 0,0 +1,52 @@


# Get the data in place 


load(file="demo.rda")


summary(firms)




# Look at it 


plot(density(log(firms$mktcap)))


plot(firms$mktcap, firms$spread, type="p", cex=.2, col="blue", log="xy",


xlab="Market cap (Mln USD)", ylab="Bid/offer spread (bps)")


m=lm(log(spread) ~ log(mktcap), firms)


summary(m)




# Making deciles 


library(gtools)


library(gdata)


# for deciles (default=quartiles)


size.category = quantcut(firms$mktcap, q=seq(0, 1, 0.1), labels=F)


table(size.category)


means = aggregate(firms, list(size.category), mean)


print(data.frame(means$mktcap,means$spread))




# Make a picture combining the sample mean of spread (in each decile)


# with the weighted average sample mean of the spread (in each decile),


# where weights are proportional to size.


wtd.means = by(firms, size.category,


function(piece) (sum(piece$mktcap*piece$spread)/sum(piece$mktcap)))


lines(means$mktcap, means$spread, type="b", lwd=2, col="green", pch=19)


lines(means$mktcap, wtd.means, type="b", lwd=2, col="red", pch=19)


legend(x=0.25, y=0.5, bty="n",


col=c("blue", "green", "red"),


lty=c(0, 1, 1), lwd=c(0,2,2),


pch=c(0,19,19),


legend=c("firm", "Mean spread in size deciles",


"Size weighted mean spread in size deciles"))




# Within group standard deviations 


aggregate(firms, list(size.category), sd)




# Now I do quartiles by BOTH mktcap and spread.


size.quartiles = quantcut(firms$mktcap, labels=F)


spread.quartiles = quantcut(firms$spread, labels=F)


table(size.quartiles, spread.quartiles)


# Reexpress everything as joint probabilities


table(size.quartiles, spread.quartiles)/nrow(firms)


# Compute cell means at every point in the joint table:


aggregate(firms, list(size.quartiles, spread.quartiles), mean)




# Make pretty twoway tables


aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, nobs)


aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, mean)


aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, sd)


aggregate.table(firms$spread, size.quartiles, spread.quartiles, mean)


aggregate.table(firms$spread, size.quartiles, spread.quartiles, sd)

11
r/R Hello World Program.r
Normal file
11
r/R Hello World Program.r
Normal file
@ 0,0 +1,11 @@


> # We can use the print() function


> print("Hello World!")


[1] "Hello World!"




> # Quotes can be suppressed in the output


> print("Hello World!", quote = FALSE)


[1] Hello World!




> # If there are more than 1 item, we can concatenate using paste()


> print(paste("How","are","you?"))


[1] "How are you?"

@ 0,0 +1,12 @@


# Goal: R syntax where model specification is an argument to a function.




# Invent a dataset


x < runif(100); y < runif(100); z < 2 + 3*x + 4*y + rnorm(100)


D < data.frame(x=x, y=y, z=z)




amodel < function(modelstring) {


summary(lm(modelstring, D))


}




amodel(z ~ x)


amodel(z ~ y)

3
r/README.md
Normal file
3
r/README.md
Normal file
@ 0,0 +1,3 @@


# R programming language




R is a programming language designed for statistical computing and graphics purposes. Contains code that can be executed within the R software environment.

@ 0,0 +1,43 @@


# Goal: Reading and writing ascii files, reading and writing binary files.


# And, to measure how much faster it is working with binary files.




# First manufacture a tall data frame:


# FYI  runif(10) yields 10 U(0,1) random numbers.


B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))


summary(B)




# Write out ascii file:


write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)


# Read in this resulting ascii file:


C=read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names=1)


# Write a binary file out of dataset C:


save(C, file="/tmp/foo.binary")


# Delete the dataset C:


rm(C)


# Restore from foo.binary:


load("/tmp/foo.binary")


summary(C) # should yield the same results


# as summary(B) above.






# Now we time all these operations 


cat("Time creation of dataset:\n")


system.time({


B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))


})




cat("Time writing an ascii file out of dataset B:\n")


system.time(


write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)


)




cat("Time reading an ascii file into dataset C:\n")


system.time(


{C=read.table("/tmp/foo.csv", header = TRUE, sep=",", row.names=1)


})




cat("Time writing a binary file out of dataset C:\n")


system.time(save(C, file="/tmp/foo.binary"))




cat("Time reading a binary file + variablenames from /tmp/foo.binary:\n")


system.time(load("/tmp/foo.binary")) # and then read it in from binary file

15
r/Reading in a Microsoft .r
Normal file
15
r/Reading in a Microsoft .r
Normal file
@ 0,0 +1,15 @@


# Goal: Reading in a Microsoft .xls file directly




library(gdata)


a < read.xls("file.xls", sheet=2) # This reads in the 2nd sheet




# Look at what the cat dragged in


str(a)




# If you have a date column, you'll want to fix it up like this:


a$date < as.Date(as.character(a$X), format="%d%b%y")


a$X < NULL






# Also see http://tolstoy.newcastle.edu.au/R/help/06/04/25674.html for


# another path.

17
r/Reading in a file made by CMIE's Business Beacon program.r
Normal file
17
r/Reading in a file made by CMIE's Business Beacon program.r
Normal file
@ 0,0 +1,17 @@


# Goal: To read in files produced by CMIE's "Business Beacon".


# This assumes you have made a file of MONTHLY data using CMIE's


# Business Beacon program. This contains 2 columns: M3 and M0.




A < read.table(


# Generic to all BB files 


sep="", # CMIE's .txt file is pipe delimited


skip=3, # Skip the 1st 3 lines


na.strings=c("N.A.","Err"), # The ways they encode missing data


# Specific to your immediate situation 


file="bb_data.text",


col.names=c("junk", "date", "M3", "M0")


)


A$junk < NULL # Blow away this column




# Parse the CMIEstyle "Mmm yy" date string that's used on monthly data


A$date < as.Date(paste("1", as.character(A$date)), format="%d %b %Y")

16
r/Sample From a Population.r
Normal file
16
r/Sample From a Population.r
Normal file
@ 0,0 +1,16 @@


> # sample with replacement


> sample(x, replace = TRUE)


[1] 15 17 13 9 5 15 11 15 1




> # if we simply pass in a positive number n, it will sample


> # from 1:n without replacement


> sample(10)


[1] 2 4 7 9 1 3 10 5 8 6










An example to simulate a coin toss for 10 times.




> sample(c("H","T"),10, replace = TRUE)


[1] "H" "H" "H" "T" "H" "T" "H" "H" "H" "T"

@ 0,0 +1,40 @@


# Goals: Scare the hell out of children with the Cauchy distribution.




# A function which simulates N draws from one of two distributions,


# and returns the mean obtained thusly.


one.simulation < function(N=100, distribution="normal") {


if (distribution == "normal") {


x < rnorm(N)


} else {


x < rcauchy(N)


}


mean(x)


}




k1 < density(replicate(1000, one.simulation(20)))


k2 < density(replicate(1000, one.simulation(20, distribution="cauchy")))




xrange < range(k1$x, k2$x)


plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")


grid()


lines(k2$x, k2$y, col="red")


abline(v=.5)


legend(x="topleft", bty="n",


lty=c(1,1),


col=c("black", "red"),


legend=c("Mean of Normal", "Mean of Cauchy"))


# The distribution of the mean of normals collapses into a point;


# that of the cauchy does not.




# Here's more scary stuff 


for (i in 1:10) {


cat("Sigma of distribution of 1000 draws from mean of normal  ",


sd(replicate(1000, one.simulation(20))), "\n")


}


for (i in 1:10) {


cat("Sigma of distribution of 1000 draws from mean of cauchy  ",


sd(replicate(1000, one.simulation(20, distribution="cauchy"))), "\n")


}




# Exercise for the reader: Compare the distribution of the median of


# the Normal against the distribution of the median of the Cauchy.

@ 0,0 +1,33 @@


# Goals: Lots of times, you need to give an R object to a friend,


# or embed data into an email.




# First I invent a little dataset 


set.seed(101) # To make sure you get the same random numbers as me


# FYI  runif(10) yields 10 U(0,1) random numbers.


A = data.frame(x1=runif(10), x2=runif(10), x3=runif(10))


# Look at it 


print(A)




# Writing to a binary file that can be transported


save(A, file="/tmp/my_data_file.rda") # You can give this file to a friend


load("/tmp/my_data_file.rda")




# Plan B  you want pure ascii, which can be put into an email 


dput(A)


# This gives you a block of R code. Let me utilise that generated code


# to create a dataset named "B".


B < structure(list(x1 = c(0.372198376338929, 0.0438248154241592,
