id submission_type answer
tutorial-id none data-import
name question Abdul Rahman
email question mono.punjabian@gmail.com
reading-data-from-a-file-1 question Version: 2.1.5
reading-data-from-a-file-2 exercise students <- read_csv("data/students.csv")
reading-data-from-a-file-3 exercise students <- read_csv("data/students.csv")
reading-data-from-a-file-4 exercise print(students)
reading-data-from-a-file-5 exercise students <- read_csv("data/students.csv", na = c("N/A", ""))
reading-data-from-a-file-6 exercise students <- read_csv("data/students.csv", na = c("N/A", "")) |> rename(student_id = `Student ID`)
reading-data-from-a-file-7 exercise library(janitor)
reading-data-from-a-file-8 exercise students |> clean_names()
reading-data-from-a-file-9 exercise students <- read_csv("data/students.csv", na = c("N/A", "")) |> clean_names() |> mutate(meal_plan = factor(meal_plan))
reading-data-from-a-file-10 exercise students <- read_csv("data/students.csv", na = c("N/A", "")) |> clean_names() |> mutate( meal_plan = factor(meal_plan), age = if_else(age == "five", "5", age) )
reading-data-from-a-file-11 exercise students <- read_csv("data/students.csv", na = c("N/A", ""), show_col_types = TRUE) |> clean_names() |> mutate( meal_plan = factor(meal_plan), age = if_else(age == "five", "5", age), age = parse_number(age) )
reading-data-from-a-file-12 exercise read_csv(file = "data/test_1.csv")
reading-data-from-a-file-13 exercise read_csv(file = "data/test_1.csv", show_col_types = FALSE)
reading-data-from-a-file-14 exercise read_csv("data/test_2.csv", skip = 2, show_col_types = TRUE)
reading-data-from-a-file-15 exercise read_csv(file = "data/test_3.csv" , col_names = FALSE)
reading-data-from-a-file-16 exercise read_csv( file = "data/test_3.csv", col_names = c("a", "b", "c"), col_types = cols( a = col_double(), b = col_double(), c = col_double() ) )
reading-data-from-a-file-17 exercise read_csv( "data/test_3.csv", col_types = cols( a = col_double(), b = col_double(), c = col_double() ) )
reading-data-from-a-file-18 exercise read_csv( file = "data/test_5.csv", na = "." )
reading-data-from-a-file-19 exercise read_csv( file = "data/test_6.csv", comment = "#" )
reading-data-from-a-file-20 exercise read_csv( file = "data/test_7.csv", col_types = cols( grade = col_integer(), student = col_character() ) )
reading-data-from-a-file-21 exercise read_csv( file = "data/test_bad_names.csv", name_repair = "universal" )
reading-data-from-a-file-22 exercise read_csv("data/test_bad_names.csv") |> clean_names()
reading-data-from-a-file-23 exercise read_csv( file = "data/test_bad_names.csv", name_repair = janitor::make_clean_names )
reading-data-from-a-file-24 exercise read_delim( file = "data/delim_1.txt", delim = "|" )
reading-data-from-a-file-25 exercise read_delim( file = "data/delim_1.txt", delim = "|", col_types = cols( population = col_integer(), date = col_date() ) )
controlling-column-types-1 exercise read_csv("a,b,c\n1,2,3")
controlling-column-types-2 exercise read_csv("logical,numeric,date,string\nTRUE,1,2021-01-15,abc\nfalse,4.5,2021-02-15,def\nT,Inf,2021-02-16,ghi")
controlling-column-types-3 exercise simple_csv <- "x\n10\n.\n20\n30"; read_csv(simple_csv)
controlling-column-types-4 exercise read_csv(file = simple_csv, col_types = list(x = col_double())) # replace simple_csv with the actual file path if different
controlling-column-types-5 exercise df <- read_csv(file = simple_csv, col_types = list(x = col_double())); problems(df) # replace simple_csv with the actual file path if different
controlling-column-types-6 exercise read_csv(simple_csv, na = ".")
controlling-column-types-7 exercise df <- read_csv( file = another_csv, col_types = cols(.default = col_character()) )
controlling-column-types-8 exercise read_csv(another_csv, col_types = cols_only(x = col_character()))
controlling-column-types-9 exercise read_csv(file = "data/ex_2.csv")
controlling-column-types-10 exercise read_csv(file = "data/ex_2.csv", col_types = cols(.default = col_character()))
controlling-column-types-11 exercise read_csv(file = "data/ex_2.csv", col_types = cols(.default = col_character())) |> dplyr::mutate(a = readr::parse_integer(a))
controlling-column-types-12 exercise readr::read_csv(file = "data/ex_2.csv", col_types = cols(.default = col_character())) |> dplyr::mutate( a = readr::parse_integer(a), b = readr::parse_date(b, format = "%Y%m%d") )
controlling-column-types-13 exercise df <- read_csv("data/ex_3.csv"); problems(df)
controlling-column-types-14 exercise readr::read_csv("data/ex_3.csv") |> dplyr::mutate(x = readr::parse_date(x, "%d %B %Y"))
controlling-column-types-15 exercise readr::read_csv("data/ex_3.csv") |> dplyr::mutate( x = readr::parse_date(x, "%d %B %Y"), z = readr::parse_number(z) )
reading-data-from-multiple-fil-1 exercise list.files("data")
reading-data-from-multiple-fil-2 exercise list.files("data", pattern = "similar")
reading-data-from-multiple-fil-3 exercise list.files("data", pattern = "similar", full.names = TRUE)
reading-data-from-multiple-fil-4 exercise list.files("data", pattern = "similar", full.names = TRUE) |> purrr::map_dfr(readr::read_csv)
reading-data-from-multiple-fil-5 exercise list.files("data", pattern = "similar", full.names = TRUE) |> purrr::map_dfr(~ readr::read_csv(.x, na = "."))
reading-data-from-multiple-fil-6 exercise list.files("data", pattern = "similar", full.names = TRUE) |> purrr::map_dfr(~ readr::read_csv(.x, na = ".", show_col_types = FALSE))
reading-data-from-multiple-fil-7 exercise list.files("data", pattern = "sales")
reading-data-from-multiple-fil-8 exercise list.files("data", pattern = "sales", full.names = TRUE) |> purrr::map_dfr(readr::read_csv)
reading-data-from-multiple-fil-9 exercise list.files("data", pattern = "sales", full.names = TRUE) |> purrr::map_dfr(readr::read_csv, id = "file")
writing-to-a-file-1 exercise students2 <- students |> clean_names() |> mutate( meal_plan = factor(meal_plan), age = if_else(age == "five", "5", age), age = parse_number(age) ); students2
writing-to-a-file-2 exercise students2; print(students2)
writing-to-a-file-3 exercise write_csv(x = students2, file = "data/students2.csv")
writing-to-a-file-4 exercise read_csv(file = "data/students2.csv")
writing-to-a-file-5 exercise iris_p <- iris |> ggplot(aes(x = Sepal.Length, y = Sepal.Width)) + geom_jitter() + labs(title = "Sepal Dimensions of Various Species of Iris", x = "Sepal Length", y = "Sepal Width")
writing-to-a-file-6 exercise list.files("data")
writing-to-a-file-7 exercise read_rds(file = "data/test_1.rds")
writing-to-a-file-8 exercise write_rds(x = mtcars, file = "data/test_2.rds")
writing-to-a-file-9 exercise list.files("data")
writing-to-a-file-10 exercise read_rds(file = "data/test_2.rds")
writing-to-a-file-11 question Why define a standard for in-memory columnar data? Traditionally, data processing engine developers have created custom data structures to represent datasets in memory while they are being processed. Given the custom nature of these data structures, they must also develop serialization interfaces to convert between them and different file formats, network wire protocols, database clients, and other data transport interfaces. The net result is an enormous amount of waste, both in developer time and in CPU cycles spent serializing data from one format to another. The rationale for Arrow's in-memory columnar data format is to provide an out-of-the-box solution to several interrelated problems: it is a general-purpose tabular data representation that is highly efficient to process on modern hardware while being suitable for a wide spectrum of use cases, so fewer and fewer systems should need to create their own data structures and can simply use Arrow; it supports both random-access and streaming/scan-based workloads; and a standardized memory format facilitates reuse of libraries of algorithms, whereas with custom in-memory formats common algorithms must often be rewritten for each one. Systems that use or support Arrow can transfer data between them at little to no cost, which radically reduces serialization overhead in analytical workloads, where it can often represent 80-90% of computing costs. The language-agnostic design of the Arrow format enables systems written in different programming languages (even those running on the JVM) to exchange datasets without serialization overhead; for example, a Java application can call a C or C++ algorithm on data that originated in the JVM. (A short example using the arrow R package follows the last row below.)
data-entry-1 exercise my_tibble <- tibble(x = c(1, 2, 5), y = c("h", "m", "g"), z = c(0.08, 0.83, 0.60)); print(my_tibble) # create the tibble, then print it
data-entry-2 exercise library(tibble); my_tibble <- tribble(~x, ~y, ~z, 1, "h", 0.08, 2, "m", 0.83, 5, "g", 0.60); print(my_tibble) # load tibble, create the tibble with tribble(), then print it
minutes question 130
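
The Arrow rationale quoted in the writing-to-a-file-11 answer is easier to see with a concrete call. The sketch below is illustrative only: it uses the arrow R package and assumes arrow is installed and that data/students2.csv was written by the writing-to-a-file-3 exercise. It reads the CSV into an Arrow Table (Arrow's columnar in-memory representation) and round-trips it through Parquet, a columnar on-disk format that, unlike CSV, preserves column types.

library(arrow)

# Read the CSV into an Arrow Table rather than an R data frame;
# as_data_frame = FALSE keeps the data in Arrow's columnar memory format.
students_tbl <- read_csv_arrow("data/students2.csv", as_data_frame = FALSE)

# Write the table out as Parquet; column types travel with the file.
write_parquet(students_tbl, "data/students2.parquet")

# Reading the Parquet file back returns a tibble with the stored types.
read_parquet("data/students2.parquet")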