# Exploratory analysis of log diamond price by carat, cut, color and clarity.
# Uses the `diamonds` dataset that ships with ggplot2.

# Load and modify data ----
library(ggplot2)
library(dplyr)

# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the script is run via Rscript / knitr.
if (interactive()) {
  View(diamonds)
}

# Only use a subset of the color and clarity categories, drop the factor
# levels that are no longer present, and work with log price (rounded to
# 2 decimals) rather than raw price.
diamonds2 <- diamonds %>%
  filter(
    color %in% c("D", "E", "F"),
    clarity %in% c("SI1", "SI2", "VS1", "VS2")
  ) %>%
  droplevels() %>%
  mutate(ln_price = round(log(price), 2))

if (interactive()) {
  View(diamonds2)
}

# Determine variable types and get overall shape of data ----
summary(diamonds2)

# Y-variable
summary(diamonds2$ln_price)

# X-variables
# continuous variables
summary(diamonds2$carat)
summary(diamonds2$depth)

# discrete variables
table(diamonds2$cut)
table(diamonds2$color)
table(diamonds2$clarity)

# discrete variables by price
tapply(diamonds2$ln_price, diamonds2$cut, mean)
tapply(diamonds2$ln_price, diamonds2$color, mean)
tapply(diamonds2$ln_price, diamonds2$clarity, mean)

# discrete variables by each other
table(diamonds2$cut, diamonds2$color)
table(diamonds2$cut, diamonds2$clarity)

# Determine formula and get dirty plots ----
# Passing `data =` keeps the axis labels as plain column names.
plot(ln_price ~ carat, data = diamonds2)
plot(ln_price ~ carat, data = diamonds2, col = diamonds2$color)
boxplot(ln_price ~ cut, data = diamonds2)
boxplot(ln_price ~ color + clarity, data = diamonds2)

# Get clean plots and publication considerations ----
# Plots are print()ed explicitly so they are also drawn when the script is
# run through source(), where bare top-level values are not auto-printed.
p1 <- ggplot(data = diamonds2, aes(y = ln_price, x = carat)) +
  geom_point() +
  geom_smooth(method = "loess") +
  labs(x = "Carat", y = "Log price in dollars")
print(p1)

p2 <- ggplot(
  data = diamonds2,
  aes(y = ln_price, x = carat, fill = color, color = color)
) +
  geom_point() +
  geom_smooth(method = "loess") +
  labs(x = "Carat", y = "Log price in dollars")
print(p2)

# Group means with 95% confidence limits ----

# Summarise ln_price per group: mean, sd, and a t-based 95% confidence
# half-width (`error`) with upper (`ul`) and lower (`ll`) limits.
#
# data: a data frame containing an `ln_price` column.
# ...:  unquoted grouping columns, forwarded to group_by().
# Returns an ungrouped tibble with one row per group.
summarise_ln_price <- function(data, ...) {
  data %>%
    group_by(...) %>%
    summarise(
      mean = mean(ln_price),
      sd = sd(ln_price),
      # `sd` here is the column computed on the previous line
      error = qt(0.975, df = n() - 1) * sd / sqrt(n()),
      ul = mean + error,
      ll = mean - error,
      .groups = "drop"
    )
}

d_sum <- summarise_ln_price(diamonds2, cut)

p3 <- ggplot(data = d_sum, aes(y = mean, x = cut, fill = cut)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = ll, ymax = ul), width = 0.1) +
  labs(y = "Mean log price in dollars")
print(p3)

d_sum2 <- summarise_ln_price(diamonds2, clarity, color)

p4 <- ggplot(data = d_sum2, aes(y = mean, x = clarity, fill = color)) +
  geom_bar(aes(fill = color), stat = "identity", position = position_dodge()) +
  # dodge the error bars by 0.9 so they line up with the dodged bars
  geom_errorbar(
    aes(ymin = ll, ymax = ul, group = color),
    width = 0.1,
    position = position_dodge(0.9)
  ) +
  labs(y = "Mean log price in dollars")
print(p4)