Revision | 99d97bde6db7c8945ce8b535b76b07d3a321c020 (tree) |
---|---|
Zeit | 2018-04-26 05:38:19 |
Autor | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
I did some extra work on the presentation.
@@ -132,7 +132,7 @@ | ||
132 | 132 | % Compile with Rscript -e "library(knitr); knit('./R-intro-code.Rnw')" |
133 | 133 | |
134 | 134 | |
135 | -\title{Introduction to R} | |
135 | +\title{Introduction to R (with Hands-on Applications!)} | |
136 | 136 | \framesubtitle{A researcher's perspective} |
137 | 137 | \author{ {Lorenzo Isella}} |
138 | 138 | \institute{DG TRADE, G2, Chief Economist Team} |
@@ -171,7 +171,7 @@ | ||
171 | 171 | |
172 | 172 | |
173 | 173 | \begin{frame} |
174 | -\frametitle{Dulcis in Fundo} | |
174 | +\frametitle{What to Expect from this Training} | |
175 | 175 | % \framesubtitle{Test Frame} |
176 | 176 | % \subt{An optional subtitle} |
177 | 177 | On the other hand, by the end of this training you will |
@@ -191,14 +191,15 @@ | ||
191 | 191 | |
192 | 192 | \frametitle{Overview of the Training} |
193 | 193 | \begin{itemize} |
194 | - | |
195 | - \item We will go through the basics of R (data types, structures, | |
196 | - functions, etc...) | |
197 | - \item we will also cover in some detail some fairly advanced topics | |
198 | - and recently added libraries (tidyverse) | |
199 | - \item R is evolving rapidly and an introductory training delivered | |
200 | - 3 years ago would be out of sync with the reality of active users | |
201 | - in 2018. | |
194 | + \item Philosophy of the training: your goal is to get better, | |
195 | + faster and more productive at data analysis. | |
196 | + \item you are not interested in the 6 different kinds of atomic | |
197 | + vectors in R. | |
198 | +\item So we will race through the basics and | |
199 | + \item plunge into the tidyverse. Tidyverse is a collection of tools | |
200 | + for powerful and expressive data analysis and visualisation. | |
201 | + \item we will barely scratch the surface of many topics, but you | |
202 | + will have an idea of the state-of-the-art R for data mining. | |
202 | 203 | |
203 | 204 | \end{itemize} |
204 | 205 |
@@ -299,7 +300,7 @@ | ||
299 | 300 | and computing the average distance (dist = mean(distance, na.rm = |
300 | 301 | TRUE)) and arrival delay (delay = mean(arr{\verb|_|}delay, na.rm = TRUE)). |
301 | 302 | |
302 | -<< highlight=T, eval=TRUE,message=F>>= | |
303 | +<< highlight=T, eval=TRUE,message=F >>= | |
303 | 304 | library(nycflights13) |
304 | 305 | library(tidyverse) |
305 | 306 | by_tailnum <- group_by(flights, tailnum) |
@@ -322,7 +323,9 @@ | ||
322 | 323 | \item logical: TRUE, FALSE |
323 | 324 | \item complex: 1+4i (complex numbers with real and imaginary parts) |
324 | 325 | \end{itemize} |
325 | -You can also create your own, but we will not discuss this in these notes. | |
326 | +You can also create your own data types, but we will not discuss this | |
327 | +in these notes. Later on, we will meet the tibbles -- the tidyverse | |
328 | +reinterpretation of the basic R data frames. | |
326 | 329 | \end{frame} |
327 | 330 | |
328 | 331 | \begin{frame}[fragile] |
@@ -390,6 +393,7 @@ | ||
390 | 393 | \frametitle{Lists 1/3} |
391 | 394 | A list generalises the idea of a vector. It can hold items of |
392 | 395 | different types. The name tag is optional |
396 | +\vspace*{-0.2cm} | |
393 | 397 | |
394 | 398 | << eval=TRUE, highlight=F>>= |
395 | 399 | Lst <- list(name="Fred", wife="Mary", |
@@ -496,23 +500,74 @@ | ||
496 | 500 | ``\verb|<-|'' to assign values. |
497 | 501 | See for instance |
498 | 502 | << eval=TRUE, highlight=T>>= |
499 | -x <- seq(5) | |
500 | -x | |
503 | +x <- c(1,2,3) | |
501 | 504 | x[2] <- -4 |
502 | 505 | x |
506 | +#and sometimes the puzzling | |
507 | +y =2 | |
508 | +y= y +7 # new y = old y +7 | |
509 | +y | |
503 | 510 | @ |
504 | 511 | |
505 | 512 | |
506 | 513 | \end{frame} |
507 | 514 | |
515 | + | |
508 | 516 | \begin{frame}[fragile] |
509 | -\frametitle{Functions in R 1/} | |
517 | +\frametitle{Mutability of Data Structures -- Small Caveat} | |
518 | +We saw that ``='' can be used to assign a value. Instead, ``=='' | |
519 | +is a \underline{logical} operator that checks if | |
520 | +two values/objects are identical. | |
521 | +See for instance | |
522 | +<< eval=TRUE, highlight=T>>= | |
523 | +x = 2 | |
524 | +x | |
525 | +x == 2 | |
526 | +x == 3 | |
527 | +@ | |
510 | 528 | |
511 | 529 | |
512 | 530 | \end{frame} |
513 | 531 | |
514 | 532 | |
515 | 533 | \begin{frame}[fragile] |
534 | +\frametitle{Functions in R 1/2} | |
535 | +A function is defined by an assignment of the form | |
536 | +<< eval=F, highlight=T >>= | |
537 | +name <- function(arg_1, arg_2, …) expression | |
538 | +@ | |
539 | +The expression is an R expression that uses the arguments, arg\verb|_|i, to calculate a value. The value of the expression is the value returned for the function. | |
540 | +mean(), sum(), cumsum(), c(), are examples of R in-built functions we have | |
541 | +already met. | |
542 | +\end{frame} | |
543 | + | |
544 | +\begin{frame}[fragile] | |
545 | +\frametitle{Functions in R 2/2} | |
546 | +Example functions of one and two variables. | |
547 | + | |
548 | +<< eval=T, highlight=T >>= | |
549 | + | |
550 | +double <- function(x){ x*2} | |
551 | + | |
552 | +double_and_triple <- function(x,y) {c(x*2, y*3) } | |
553 | + | |
554 | +a <-7 | |
555 | +b <- 5 | |
556 | +double(a) | |
557 | +double_and_triple(a,b) | |
558 | +@ | |
559 | + | |
560 | +\end{frame} | |
561 | + | |
562 | + | |
563 | +% \begin{frame}[fragile] | |
564 | +% \frametitle{Functions in R 3/2} | |
565 | +% A technical remark: functions do \underline{not} modify their own arguments | |
566 | + | |
567 | +% \end{frame} | |
568 | + | |
569 | + | |
570 | +\begin{frame}[fragile] | |
516 | 571 | \frametitle{Data Input and Output in R} |
517 | 572 | \begin{itemize} |
518 | 573 | \item R provides a number of facilities to import external data in different |
@@ -521,21 +576,133 @@ | ||
521 | 576 | input/output by Excel. For importing and manipulating data, I recommend the |
522 | 577 | tidyverse library. |
523 | 578 | \end{itemize} |
524 | -If you just use base R, you can do | |
525 | -<< eval=F, highlight=T>>= | |
526 | -mydata<-read.csv(''filename.csv'') | |
527 | -@ | |
528 | - | |
529 | -or with the tidyverse library | |
530 | - | |
531 | 579 | << eval=F, highlight=T>>= |
532 | 580 | library(tidyverse) |
533 | -mydata<-read_csv(''filename.csv'') | |
581 | +# read data | |
582 | +mydata<-read_csv("filename.csv") | |
583 | +# write data | |
584 | +write_csv(mydata, "my_output_data.csv") | |
534 | 585 | @ |
535 | 586 | |
536 | 587 | \end{frame} |
537 | 588 | |
538 | 589 | |
590 | + | |
591 | +\begin{frame}[fragile] | |
592 | +\frametitle{Long Computations in R} | |
593 | + | |
594 | +R is a functional language, which means that your code often contains a lot of parentheses, ( and ). When you have complex code, this often will mean that you will have to nest those parentheses together. This makes your R code hard to read and understand. | |
595 | +<< eval=T, highlight=T>>= | |
596 | +## generate some arbitrary data | |
597 | +x<-c(1e4, 1.1e4, 2.3e4, 1.8e4,7e4,4.1e4) | |
598 | +# Compute the logarithm of `x`, return suitably | |
599 | +# lagged and iterated differences, | |
600 | +# compute the exponential function | |
601 | +# and round the result | |
602 | +round(exp(diff(log(x))), 1) | |
603 | +@ | |
604 | + | |
605 | +\end{frame} | |
606 | + | |
607 | + | |
608 | +% \begin{frame}[fragile] | |
609 | +% \frametitle{Long Computations in R} | |
610 | +% Computations can often result in expressions which are hard to read. | |
611 | +% << eval=T, highlight=T>>= | |
612 | +% ## generate some arbitrary data | |
613 | +% x<-c(1e4, 1.1e4, 2.3e4, 1.8e4,7e4,4.1e4) | |
614 | +% # Compute the logarithm of `x`, return suitably | |
615 | +% # lagged and iterated differences, | |
616 | +% # compute the exponential function | |
617 | +% # and round the result | |
618 | +% round(exp(diff(log(x))), 1) | |
619 | +% @ | |
620 | +% Wouldn't it be nice to have a way to express these operations which is | |
621 | +% easy to read and understand? | |
622 | + | |
623 | + | |
624 | +% \end{frame} | |
625 | + | |
626 | +\begin{frame}[fragile] | |
627 | +\frametitle{Enters the Pipe Operator} | |
628 | +The pipe operator \verb|%>%| has two fundamental properties | |
629 | +\begin{enumerate} | |
630 | + | |
631 | +\item Function $f(x)$ can be rewritten as $x$ \verb|%>%| $f$ | |
632 | +<< eval=T, highlight=F >>= | |
633 | +x <- 10 | |
634 | + # Compute the logarithm of `x` | |
635 | +log(x) | |
636 | +x %>% log() | |
637 | +@ | |
638 | +\item Function $f(x, y)$ can be rewritten as $x$ \verb|%>%| $f(y)$ | |
639 | + | |
640 | +<< eval=T, highlight=F >>= | |
641 | +# Round pi | |
642 | +round(pi, 6) | |
643 | +pi %>% round(6) | |
644 | +@ | |
645 | + | |
646 | + \end{enumerate} | |
647 | + | |
648 | +\end{frame} | |
649 | + | |
650 | +\begin{frame}[fragile] | |
651 | +\frametitle{Why was This Invented at All?} | |
652 | + | |
653 | +The pipe operator \verb|%>%| provides you with a number of benefits | |
654 | +\begin{enumerate} | |
655 | +\item You'll structure the sequence of your data operations from left to right, as opposed to from inside and out; | |
656 | +\item You'll avoid nested function calls; | |
657 | +\item You'll minimize the need for local variables and function definitions; and | |
658 | +\item You'll make it easy to add steps anywhere in the sequence of operations. | |
659 | +\end{enumerate} | |
660 | +<< eval=F, highlight=T >>= | |
661 | +log(sin(sqrt(x))) # becomes | |
662 | +x %>% sqrt() %>% | |
663 | + sin() %>% | |
664 | + log() #much easier to follow! | |
665 | +@ | |
666 | + | |
667 | + | |
668 | +\end{frame} | |
669 | + | |
670 | + | |
671 | +\begin{frame}[fragile] | |
672 | +\frametitle{Application to the Previous Example} | |
673 | +This sounds very abstract, but let us see \verb|%>%| in action | |
674 | +<< eval=T, highlight=T>>= | |
675 | +library(tidyverse) | |
676 | +x<-c(1e4, 1.1e4, 2.3e4, 1.8e4,7e4,4.1e4) | |
677 | +x %>% log() %>% | |
678 | + diff() %>% | |
679 | + exp() %>% | |
680 | + round(1) | |
681 | +@ | |
682 | +Now you finally understand what is going on. Cleaner code is easier to | |
683 | +share and extend. | |
684 | + | |
685 | +\end{frame} | |
686 | + | |
687 | + | |
688 | +\begin{frame}[fragile] | |
689 | +\frametitle{Modify a Sequence of Computations} | |
690 | +Now that the operations are laid out as a sequence, it is much easier to modify them whenever we need to. For instance | |
691 | +<< eval=T, highlight=T>>= | |
692 | +# Compute the logarithm of `x`, return suitably | |
693 | +# lagged and iterated differences, | |
694 | +# compute the mean | |
695 | +# and round the result with two digits | |
696 | +library(tidyverse) | |
697 | +x %>% log() %>% | |
698 | + diff() %>% | |
699 | + mean() %>% | |
700 | + round(2) | |
701 | +@ | |
702 | +\end{frame} | |
703 | + | |
704 | + | |
705 | + | |
539 | 706 | \begin{frame}[fragile] |
540 | 707 | \frametitle{Tidyverse and R} |
541 | 708 | \begin{itemize} |
@@ -549,68 +716,254 @@ | ||
549 | 716 | |
550 | 717 | \end{frame} |
551 | 718 | |
719 | + | |
552 | 720 | \begin{frame}[fragile] |
553 | -\frametitle{Long Computations in R} | |
554 | -Computations can often result in expressions which are hard to read. | |
555 | -<< eval=T, highlight=T>>= | |
556 | -## generate some arbitrary data | |
557 | -x<-c(1e4, 1.1e4, 2.3e4, 1.8e4,7e4,4.1e4) | |
558 | -# Compute the logarithm of `x`, return suitably | |
559 | -# lagged and iterated differences, | |
560 | -# compute the exponential function | |
561 | -# and round the result | |
562 | -round(exp(diff(log(x))), 1) | |
721 | +\frametitle{dplyr -- Data Manipulation 1/2} | |
722 | +dplyr (part of the tidyverse family) is a \underline{grammar of data manipulation}. | |
723 | +When working with data you must | |
724 | +\begin{itemize} | |
725 | +\item Figure out what you want to do. | |
726 | +\item Describe those tasks in the form of a computer program. | |
727 | +\item Execute the program. | |
728 | +\end{itemize} | |
729 | +The dplyr package makes these steps fast and easy | |
730 | +\begin{itemize} | |
731 | + \item By constraining your options, it helps you think about your data manipulation challenges. | |
732 | +\item It provides simple “verbs”, functions that correspond to the most common data manipulation tasks, to help you translate your thoughts into code. | |
733 | +\item It uses efficient backends, so you spend less time waiting for the computer. | |
734 | +\end{itemize} | |
735 | + | |
736 | + % filter() to select cases based on their values. | |
737 | + % arrange() to reorder the cases. | |
738 | + % select() and rename() to select variables based on their names. | |
739 | + % mutate() and transmute() to add new variables that are functions of existing variables. | |
740 | + % summarise() to condense multiple values to a single value. | |
741 | + % sample_n() and sample_frac() to take random samples. | |
742 | + | |
743 | +\end{frame} | |
744 | + | |
745 | +\begin{frame}[fragile] | |
746 | +\frametitle{dplyr -- Data Manipulation 2/2} | |
747 | +dplyr is a grammar because it provides verbs that help you solve the most common data manipulation challenges: | |
748 | +\begin{itemize} | |
749 | +\item mutate() adds new variables that are functions of existing variables | |
750 | +\item select() picks variables based on their names. | |
751 | +\item filter() picks cases based on their values. | |
752 | +\item summarise() reduces multiple values down to a single summary. | |
753 | +\item arrange() changes the ordering of the rows. | |
754 | +\item group\verb|_|by() which allows you to perform any operation ``by group''. | |
755 | +\end{itemize} | |
756 | +This works beautifully with the pipe operator. | |
757 | + | |
758 | +\end{frame} | |
759 | + | |
760 | +\begin{frame}[fragile] | |
761 | +\frametitle{Example with Balance of Payment Data} | |
762 | +<< highlight=T, eval=TRUE,message=F, warning=F >>= | |
763 | +library(tidyverse) | |
764 | +df<-read_csv("bop_flow2.csv") %>% | |
765 | + {.$Value=as.numeric(.$Value) | |
766 | + .} | |
563 | 767 | @ |
564 | -Wouldn't it be nice to have a way to express these operations which is | |
565 | -easy to read and understand? | |
768 | + | |
769 | +Let us take a glimpse at the resulting table (only a few lines are shown) | |
770 | + | |
771 | +% df<-read_csv("bop_flow2.csv",col_types = cols(Value = "i")) | |
772 | + | |
773 | + | |
774 | + | |
775 | +\begin{table}[ht] | |
776 | +\centering | |
777 | +\scalebox{0.7}{ | |
778 | +\begin{tabular}{rlllll} | |
779 | + \hline | |
780 | +TIME & GEO & CURRENCY & NACE\_R2 & STK\_FLOW & STK\_FLOW\_LABEL \\ | |
781 | + \hline | |
782 | +2016 & EU28 & Million euro & TOTAL & ASS & Assets \\ | |
783 | + 2016 & EU28 & Million euro & TOTAL & ASS & Assets \\ | |
784 | + 2016 & EU28 & Million euro & TOTAL & ASS & Assets \\ | |
785 | + \hline | |
786 | +\end{tabular} | |
787 | +} | |
788 | +\end{table} | |
789 | + | |
790 | +\begin{table}[ht] | |
791 | +\centering | |
792 | +\scalebox{0.7}{ | |
793 | +\begin{tabular}{rllllr} | |
794 | + \hline | |
795 | +TIME & ENTITY & FDI\_ITEM & FDI\_ITEM\_LABEL & PARTNER & Value \\ | |
796 | + \hline | |
797 | +2016 & TOTAL & DO\_\_D\_\_F & Direct investment abroad (DIA) & CH & NA \\ | |
798 | + 2016 & TOTAL & DO\_\_D\_\_F & Direct investment abroad (DIA) & TR & NA \\ | |
799 | + 2016 & TOTAL & DO\_\_D\_\_F & Direct investment abroad (DIA) & RU & NA \\ | |
800 | + \hline | |
801 | +\end{tabular} | |
802 | +} | |
803 | +\end{table} | |
804 | + | |
805 | + | |
806 | + | |
807 | + | |
566 | 808 | |
567 | 809 | |
568 | 810 | \end{frame} |
569 | 811 | |
570 | 812 | \begin{frame}[fragile] |
571 | -\frametitle{Enters the Pipe Operator} | |
572 | -The pipe operator \verb|%>%| is used to transfer the results of a | |
573 | -computation in a pipeline. | |
574 | -<< eval=T, highlight=T>>= | |
813 | +\frametitle{dplyr Verbs in Action 1/4} | |
814 | +In 2015, how many million euros did the EU28 (GEO) invest | |
815 | +(FDI\verb|_|ITEM is DO\verb|_|\verb|_|D\verb|_|\verb|_|F; ENTITY is TOTAL) in manufacture | |
816 | +(NACE\verb|_|R2 is C) in Japan (PARTNER is JP) as outward net foreign | |
817 | +direct investment (STK\verb|_|FLOW is NO)? | |
818 | +<< highlight=T, eval=TRUE,message=F >>= | |
575 | 819 | library(tidyverse) |
576 | -x %>% log() %>% | |
577 | - diff() %>% | |
578 | - exp() %>% | |
579 | - round(1) | |
820 | +manu_JP <- df %>%filter(TIME==2015, GEO=="EU28", | |
821 | + STK_FLOW=="NO",FDI_ITEM=="DO__D__F", | |
822 | + ENTITY=="TOTAL",PARTNER=="JP", NACE_R2=="C") %>% | |
823 | + select(TIME, GEO, PARTNER, NACE_R2, Value) | |
824 | +manu_JP | |
580 | 825 | @ |
581 | -This leads to cleaner code, easier to understand and debug. | |
826 | +\end{frame} | |
827 | + | |
828 | + | |
829 | + | |
830 | +\begin{frame}[fragile] | |
831 | +\frametitle{dplyr Verbs in Action 2/4} | |
832 | +And the total FDI to the US for all years | |
833 | +<< highlight=T, eval=TRUE,message=F >>= | |
834 | +library(tidyverse) | |
835 | +FDI_US <- df %>%filter( GEO=="EU28", | |
836 | +STK_FLOW=="NO",FDI_ITEM=="DO__D__F", | |
837 | +ENTITY=="TOTAL",PARTNER =="US",NACE_R2=="FDI") %>% | |
838 | +select(TIME, GEO, PARTNER, NACE_R2, Value) | |
839 | +FDI_US | |
840 | +@ | |
841 | +\end{frame} | |
842 | + | |
843 | + | |
844 | +\begin{frame}[fragile] | |
845 | +\frametitle{dplyr Verbs in Action 3/4} | |
846 | +And if you want the average FDI to the US over the years | |
847 | +<< highlight=T, eval=TRUE,message=F >>= | |
848 | +library(tidyverse) | |
849 | +FDI_US_mean <- df %>%filter( GEO=="EU28", | |
850 | +STK_FLOW=="NO",FDI_ITEM=="DO__D__F", | |
851 | +ENTITY=="TOTAL",PARTNER =="US", NACE_R2=="FDI")%>% | |
852 | +select(TIME, GEO, PARTNER, NACE_R2, Value) %>% | |
853 | +summarise(mean_FDI_to_US=mean(Value)) | |
854 | +FDI_US_mean | |
855 | +@ | |
856 | +\end{frame} | |
857 | + | |
858 | + | |
859 | +\begin{frame}[fragile] | |
860 | +\frametitle{dplyr Verbs in Action 4/4} | |
861 | +Now you want to do the same for US and India in one go | |
862 | +\vspace*{-0.2cm} | |
863 | +<< highlight=T, eval=TRUE,message=F >>= | |
864 | +library(tidyverse) | |
865 | +FDI_US_IN <- df %>%filter( GEO=="EU28", | |
866 | +STK_FLOW=="NO",FDI_ITEM=="DO__D__F", | |
867 | +ENTITY=="TOTAL",PARTNER %in% c("US", "IN"), | |
868 | +NACE_R2=="FDI")%>% | |
869 | +select(TIME, GEO, PARTNER, NACE_R2, Value) %>% | |
870 | +group_by(PARTNER) %>% | |
871 | +summarise(mean_FDI=mean(Value)) | |
872 | +FDI_US_IN | |
873 | +@ | |
874 | +\end{frame} | |
875 | + | |
876 | + | |
877 | +\begin{frame}[fragile] | |
878 | +\frametitle{dplyr -- Final Thoughts} | |
879 | +\begin{itemize} | |
880 | + \item we barely scratched the surface of dplyr | |
881 | + \item but we have already seen filter, selection of columns and | |
882 | + computing statistics on groups of variables | |
883 | + \item thanks to the pipe operator, most of the code that you write | |
884 | + is reusable and readable | |
885 | + \item you do not worry about cells, indexes etc..., but you think | |
886 | + more about the questions you want to pose to your data. | |
887 | +\end{itemize} | |
888 | +\end{frame} | |
889 | + | |
890 | +\begin{frame}[fragile] | |
891 | +\frametitle{Tidy Data} | |
892 | +The tidyverse is named after the tidy data format. In tidy data | |
893 | +\begin{enumerate} | |
894 | +\item Each variable forms a column. | |
895 | +\item Each observation forms a row. | |
896 | +\item Each type of observational unit forms a table. | |
897 | +\end{enumerate} | |
898 | + | |
899 | +Tidy data makes it easy for an analyst or a computer to extract needed | |
900 | +variables because it provides a standard way of structuring a | |
901 | +dataset. You do not need different strategies to extract different variables. | |
902 | +The FDI flow data set was cast in a tidy format. | |
903 | + | |
904 | +Every time you have a data set with the year on the horizontal axis, | |
905 | +you can be sure that the data set is messy (not tidy). | |
582 | 906 | |
583 | 907 | \end{frame} |
584 | 908 | |
585 | 909 | \begin{frame}[fragile] |
586 | -\frametitle{Reasons to use the pipe operator} | |
587 | - | |
910 | +\frametitle{Tidying Messy Datasets} | |
911 | +Real data sets are often messy in every conceivable way, e.g. | |
912 | +\begin{itemize} | |
913 | +\item Column headers are values, not variable names. | |
588 | 914 | |
589 | -\begin{enumerate} | |
590 | -\item You'll structure the sequence of your data operations from left to right, as apposed to from inside and out; | |
591 | -\item You'll avoid nested function calls; | |
592 | -\item You'll minimize the need for local variables and function definitions; And | |
593 | -\item You'll make it easy to add steps anywhere in the sequence of operations. | |
594 | -\end{enumerate} | |
915 | +\item Multiple variables are stored in one column. | |
916 | + | |
917 | +\item Variables are stored in both rows and columns. | |
918 | + | |
919 | +\item Multiple types of observational units are stored in the same table. | |
920 | + | |
921 | +\item A single observational unit is stored in multiple tables. | |
922 | +\end{itemize} | |
923 | +Tidying messy data sets is in itself a large topic; we'll focus only | |
924 | +on one example in the following. | |
595 | 925 | \end{frame} |
596 | 926 | |
597 | 927 | \begin{frame}[fragile] |
598 | -\frametitle{Enters the Pipe Operator Again} | |
599 | -It is straightforward to modify the previous sequence of operations | |
600 | -<< eval=T, highlight=T>>= | |
601 | -# Compute the logarithm of `x`, return suitably | |
602 | -# lagged and iterated differences, | |
603 | -# compute the mean | |
604 | -# and round the result with two digits | |
928 | +\frametitle{Column headers are values, not variable names} | |
929 | +This is one of the most common cases. See for instance some data about | |
930 | +income and religion in the US | |
931 | +<< highlight=T, eval=TRUE,message=F >>= | |
605 | 932 | library(tidyverse) |
606 | -x %>% log() %>% | |
607 | - diff() %>% | |
608 | - mean() %>% | |
609 | - round(2) | |
933 | +pew <-read_csv("income_religion.csv") | |
610 | 934 | @ |
935 | +\begin{table}[ht] | |
936 | +\centering | |
937 | +\scalebox{0.7}{ | |
938 | +\begin{tabular}{lrrrrrr} | |
939 | + \hline | |
940 | +religion & $<$\$10k & \$10-20k & \$20-30k & \$30-40k & \$40-50k & \$$>$50k \\ | |
941 | + \hline | |
942 | +Agnostic & 27 & 34 & 60 & 81 & 76 & 137 \\ | |
943 | + Atheist & 12 & 27 & 37 & 52 & 35 & 70 \\ | |
944 | + Buddhist & 27 & 21 & 30 & 34 & 33 & 58 \\ | |
945 | + Catholic & 418 & 617 & 732 & 670 & 638 & 1116 \\ | |
946 | + Don’t know & 15 & 14 & 15 & 11 & 10 & 35 \\ | |
947 | + Evangelical & 575 & 869 & 1064 & 982 & 881 & 1486 \\ | |
948 | + Hindu & 1 & 9 & 7 & 9 & 11 & 34 \\ | |
949 | + Historically black & 228 & 244 & 236 & 238 & 197 & 223 \\ | |
950 | + Jehovah's Witnesses & 20 & 27 & 24 & 24 & 21 & 30 \\ | |
951 | + Jewish & 19 & 19 & 25 & 25 & 30 & 95 \\ | |
952 | + \hline | |
953 | +\end{tabular} | |
954 | +} | |
955 | +\end{table} | |
956 | + | |
957 | +This dataset has three variables: religion, income class and frequency. | |
958 | +Religion has its own column, but the income classes are spread across the | |
959 | +column headers (the non-variable columns), and the frequencies are the cell values. | |
611 | 960 | \end{frame} |
612 | 961 | |
962 | +\begin{frame}[fragile] | |
963 | +\frametitle{Tidying the Data Set 1/} | |
964 | +To tidy the pew data set, we need to \underline{gather} the non-variable columns into a two-column key-value pair. | |
613 | 965 | |
966 | +\end{frame} | |
614 | 967 | |
615 | 968 | \begin{frame}[fragile] |
616 | 969 | \frametitle{Linear Models in R} |