| tutorial-id |
none |
data-import |
| name |
question |
Ansh Patel |
| email |
question |
anshbrunch@gmail.com |
| reading-data-from-a-file-1 |
question |
Documentation for package ‘readr’ version 2.1.5 |
| reading-data-from-a-file-2 |
exercise |
read_csv(file = "data/students.csv") |
| reading-data-from-a-file-3 |
exercise |
students <- read_csv(file = "data/students.csv") |
| reading-data-from-a-file-4 |
exercise |
students |
| reading-data-from-a-file-5 |
exercise |
students <- read_csv(file = "data/students.csv", na = c("N/A", "")) |
| reading-data-from-a-file-6 |
exercise |
students |>
rename(student_id = "Student ID") |
| reading-data-from-a-file-7 |
exercise |
library(janitor) |
| reading-data-from-a-file-8 |
exercise |
students |>
clean_names() |
| reading-data-from-a-file-9 |
exercise |
students |>
clean_names() |>
mutate(meal_plan = factor(meal_plan)) |
| reading-data-from-a-file-10 |
exercise |
students |>
clean_names() |>
mutate(meal_plan = factor(meal_plan), age = if_else(age == "five", "5", age)) |
| reading-data-from-a-file-11 |
exercise |
students |>
clean_names() |>
mutate(meal_plan = factor(meal_plan), age = if_else(age == "five", "5", age), age = parse_number(age)) |
| reading-data-from-a-file-13 |
exercise |
read_csv(file = "data/test_1.csv", show_col_types = FALSE) |
| reading-data-from-a-file-14 |
exercise |
read_csv(file = "data/test_2.csv",
skip = 2) |
| reading-data-from-a-file-15 |
exercise |
read_csv(file = "data/test_3.csv",
col_names = FALSE) |
| reading-data-from-a-file-16 |
exercise |
read_csv(file = "data/test_3.csv",
col_names = c("a", "b", "c")) |
| reading-data-from-a-file-17 |
exercise |
read_csv("data/test_3.csv",
col_names = c("a", "b", "c"),
col_types = cols(a = col_double(),
b = col_double(),
c = col_double())) |
| reading-data-from-a-file-18 |
exercise |
read_csv("data/test_5.csv", na = ".") |
| reading-data-from-a-file-19 |
exercise |
read_csv("data/test_6.csv", comment = "#") |
| reading-data-from-a-file-20 |
exercise |
read_csv("data/test_7.csv", col_types = cols(grade = col_integer(),
student = col_character())) |
| reading-data-from-a-file-21 |
exercise |
read_csv("data/test_bad_names.csv", name_repair = "universal") |
| reading-data-from-a-file-22 |
exercise |
read_csv("data/test_bad_names.csv") |> clean_names() |
| reading-data-from-a-file-23 |
exercise |
read_csv("data/test_bad_names.csv", name_repair = janitor::make_clean_names) |
| reading-data-from-a-file-24 |
exercise |
read_delim(file = "data/test_bad_names.csv", name_repair = janitor::make_clean_names) |
| reading-data-from-a-file-25 |
exercise |
read_delim("data/delim_2.txt",
delim = "|",
col_types = cols(date = col_date(format = ""),
population = col_integer(),
town = col_character())) |
| controlling-column-types-1 |
exercise |
read_csv("
a, b, c
1, 2, 3") |
| controlling-column-types-2 |
exercise |
read_csv("
logical,numeric,date,string
TRUE,1,2021-01-15,abc
false,4.5,2021-02-15,def
T,Inf,2021-02-16,ghi
") |
| controlling-column-types-3 |
exercise |
simple_csv <- "
x
10
.
20
30"
read_csv(simple_csv) |
| controlling-column-types-5 |
exercise |
df <- read_csv(
simple_csv,
col_types = list(x = col_double())
)
problems(df) |
| controlling-column-types-6 |
exercise |
read_csv(simple_csv, na = ".") |
| controlling-column-types-7 |
exercise |
read_csv(
another_csv,
col_types = cols(.default = col_character())) |
| controlling-column-types-8 |
exercise |
read_csv(
another_csv,
col_types = cols_only(x = col_character())) |
| controlling-column-types-9 |
exercise |
read_csv("data/ex_2.csv") |
| controlling-column-types-10 |
exercise |
read_csv("data/ex_2.csv",
col_types = cols(.default = col_character())
) |
| controlling-column-types-11 |
exercise |
read_csv("data/ex_2.csv",
col_types = cols(.default = col_character())) |> mutate(a = parse_integer(a)) |
| controlling-column-types-12 |
exercise |
read_csv("data/ex_2.csv",
col_types = cols(.default = col_character())) |> mutate(a = parse_integer(a)) |> mutate(b = parse_integer(b), format = "%Y%M%D") |
| controlling-column-types-13 |
exercise |
read_csv("data/ex_3.csv") |
| controlling-column-types-14 |
exercise |
read_csv("data/ex_3.csv") |> mutate(x = parse_date(x, "%d %B %Y")) |
| controlling-column-types-15 |
exercise |
read_csv("data/ex_3.csv") |> mutate(x = parse_date(x, "%d %B %Y")) |> mutate(z = parse_number(z)) |
| reading-data-from-multiple-fil-1 |
exercise |
list.files("data") |
| reading-data-from-multiple-fil-2 |
exercise |
list.files("data", pattern = "similar") |
| reading-data-from-multiple-fil-3 |
exercise |
list.files("data", pattern = "similar", full.names = TRUE) |
| reading-data-from-multiple-fil-4 |
exercise |
list.files("data", pattern = "similar", full.names = TRUE) |> read_csv() |
| reading-data-from-multiple-fil-5 |
exercise |
list.files("data", pattern = "similar", full.names = TRUE) |> read_csv(na = ".") |
| reading-data-from-multiple-fil-6 |
exercise |
list.files("data", pattern = "similar", full.names = TRUE) |> read_csv(na = ".", show_col_types = FALSE) |
| reading-data-from-multiple-fil-7 |
exercise |
list.files(path = "data",
pattern = "sales") |
| reading-data-from-multiple-fil-8 |
exercise |
list.files(path = "data",
pattern = "sales", full.names = TRUE) |> read_csv() |
| reading-data-from-multiple-fil-9 |
exercise |
list.files(path = "data",
pattern = "sales", full.names = TRUE) |> read_csv(id = "file") |
| writing-to-a-file-2 |
exercise |
students2 |
| writing-to-a-file-3 |
exercise |
write_csv(x = students2,
file = "data/students2.csv") |
| writing-to-a-file-4 |
exercise |
read_csv("data/students2.csv") |
| writing-to-a-file-5 |
exercise |
iris_p <- iris |>
ggplot(aes(x = Sepal.Length, y = Sepal.Width)) +
geom_jitter() +
labs(title = "Sepal Dimensions of Various Species of Iris",
x = "Sepal Length",
y = "Sepal Width") |
| writing-to-a-file-6 |
exercise |
list.files("data") |
| writing-to-a-file-7 |
exercise |
read_rds(file = "data/test_1.rds") |
| writing-to-a-file-8 |
exercise |
write_rds(mtcars, "data/test_2.rds") |
| writing-to-a-file-9 |
exercise |
list.files("data") |
| writing-to-a-file-11 |
question |
Why define a standard for columnar in-memory?
Traditionally, data processing engine developers have created custom data structures to represent datasets in-memory while they are being processed. Given the “custom” nature of these data structures, they must also develop serialization interfaces to convert between these data structures and different file formats, network wire protocols, database clients, and other data transport interfaces. The net result of this is an incredible amount of waste both in developer time and in CPU cycles spent serializing data from one format to another.
The rationale for Arrow’s in-memory columnar data format is to provide an out-of-the-box solution to several interrelated problems:
A general purpose tabular data representation that is highly efficient to process on modern hardware while also being suitable for a wide spectrum of use cases. We believe that fewer and fewer systems will create their own data structures and simply use Arrow.
Supports both random access and streaming / scan-based workloads.
A standardized memory format facilitates reuse of libraries of algorithms. When custom in-memory data formats are used, common algorithms must often be rewritten to target those custom data formats.
Systems that both use or support Arrow can transfer data between them at little-to-no cost. This results in a radical reduction in the amount of serialization overhead in analytical workloads that can often represent 80-90% of computing costs.
The language-agnostic design of the Arrow format enables systems written in different programming languages (even running on the JVM) to communicate datasets without serialization overhead. For example, a Java application can call a C or C++ algorithm on data that originated in the JVM. |
| data-entry-1 |
exercise |
tibble(
x = c(1, 2, 5),
y = c("h", "m", "g"),
z = c(0.08, 0.83, 0.60)
) |
| data-entry-2 |
exercise |
tribble(
~x, ~y, ~z,
1, "h", 0.08,
2, "m", 0.83,
5, "g", 0.60
) |
| minutes |
question |
60 |