| tutorial-id |
none |
data-import |
| name |
question |
Jacob Khaykin |
| email |
question |
jacobkhaykin27@solonschools.net |
| reading-data-from-a-file-1 |
question |
Documentation for package ‘readr’ version 2.1.5 |
| reading-data-from-a-file-2 |
exercise |
read_csv(file = "data/students.csv") |
| reading-data-from-a-file-3 |
exercise |
students <- read_csv(file = "data/students.csv") |
| reading-data-from-a-file-4 |
exercise |
students |
| reading-data-from-a-file-5 |
exercise |
students <- read_csv(file = "data/students.csv",
na = c("N/A", "")) |
| reading-data-from-a-file-6 |
exercise |
students |>
rename(student_id = "Student ID") |
| reading-data-from-a-file-7 |
exercise |
library(janitor) |
| reading-data-from-a-file-8 |
exercise |
students |>
clean_names() |
| reading-data-from-a-file-9 |
exercise |
students |>
clean_names() |>
mutate(meal_plan = factor(meal_plan)) |
| reading-data-from-a-file-10 |
exercise |
students |>
clean_names() |>
mutate(
meal_plan = factor(meal_plan),
age = if_else(age == "five", "5", age)
) |
| reading-data-from-a-file-11 |
exercise |
students |>
clean_names() |>
mutate(
meal_plan = factor(meal_plan),
age = if_else(age == "five", "5", age),
age = parse_number(age)
) |
| reading-data-from-a-file-12 |
exercise |
read_csv(file = "data/test_1.csv") |
| reading-data-from-a-file-13 |
exercise |
read_csv(file = "data/test_1.csv",
show_col_types = FALSE) |
| reading-data-from-a-file-14 |
exercise |
read_csv(file = "data/test_2.csv",
skip = 2) |
| reading-data-from-a-file-15 |
exercise |
read_csv(file = "data/test_3.csv",
col_names = FALSE) |
| reading-data-from-a-file-16 |
exercise |
read_csv(file = "data/test_3.csv",
col_names = c("a", "b", "c")) |
| reading-data-from-a-file-17 |
exercise |
read_csv("data/test_3.csv",
col_names = c("a", "b", "c"),
col_types = cols(a = col_double(),
b = col_double(),
c = col_double())) |
| reading-data-from-a-file-18 |
exercise |
read_csv(file = "data/test_5.csv",
na = ".") |
| reading-data-from-a-file-19 |
exercise |
read_csv(file = "data/test_6.csv",
comment = "#") |
| reading-data-from-a-file-20 |
exercise |
col_types = cols(grade = col_integer(),
student = col_character()) |
| reading-data-from-a-file-21 |
exercise |
read_csv(file = "data/test_bad_names.csv",
name_repair = "universal") |
| reading-data-from-a-file-22 |
exercise |
read_csv(file = "data/test_bad_names.csv") |>
clean_names() |
| reading-data-from-a-file-23 |
exercise |
read_csv(file = "data/test_bad_names.csv",
name_repair = janitor::make_clean_names) |
| reading-data-from-a-file-24 |
exercise |
read_delim(file = "data/delim_1.txt") |
| reading-data-from-a-file-25 |
exercise |
col_types = cols(date = col_date(format = ""),
population = col_integer(),
town = col_character()) |
| controlling-column-types-1 |
exercise |
read_csv("
a, b, c
1, 2, 3") |
| controlling-column-types-2 |
exercise |
read_csv("
logical,numeric,date,string
TRUE,1,2021-01-15,abc
false,4.5,2021-02-15,def
T,Inf,2021-02-16,ghi
") |
| controlling-column-types-3 |
exercise |
simple_csv <- "
x
10
.
20
30"
read_csv(simple_csv) |
| controlling-column-types-4 |
exercise |
read_csv(
simple_csv,
col_types = list(x = col_double())
) |
| controlling-column-types-5 |
exercise |
df <- read_csv(
simple_csv,
col_types = list(x = col_double())
)
problems(df) |
| controlling-column-types-6 |
exercise |
read_csv(simple_csv, na = ".") |
| controlling-column-types-7 |
exercise |
read_csv(
another_csv,
col_types = cols(.default = col_character())
) |
| controlling-column-types-8 |
exercise |
read_csv(
another_csv,
col_types = cols_only(x = col_character())
) |
| controlling-column-types-9 |
exercise |
read_csv("data/ex_2.csv") |
| controlling-column-types-10 |
exercise |
# Read every column as character; the file lives in data/, as in the previous exercise.
read_csv("data/ex_2.csv",
  col_types = cols(.default = col_character())
) |
| controlling-column-types-11 |
exercise |
# Read every column as character (file lives in data/), then convert `a` to integer.
read_csv("data/ex_2.csv",
  col_types = cols(.default = col_character())
) |>
  mutate(a = parse_integer(a)) |
| controlling-column-types-12 |
exercise |
# Read every column as character (file lives in data/), then parse each column explicitly.
# Date format is "%Y%m%d": lower-case %m = month and %d = day.
# (Upper-case %M means minutes and %D means "%m/%d/%y", so "%Y%M%D" cannot parse a date.)
read_csv("data/ex_2.csv",
  col_types = cols(.default = col_character())
) |>
  mutate(a = parse_integer(a)) |>
  mutate(b = parse_date(b, format = "%Y%m%d")) |
| controlling-column-types-13 |
exercise |
read_csv("data/ex_3.csv") |
| controlling-column-types-14 |
exercise |
read_csv("data/ex_3.csv") |>
mutate(x = parse_date(x, "%d %B %Y")) |
| controlling-column-types-15 |
exercise |
read_csv("data/ex_3.csv") |>
mutate(x = parse_date(x, "%d %B %Y"))|>
mutate(z = parse_number(z)) |
| reading-data-from-multiple-fil-1 |
exercise |
list.files("data") |
| reading-data-from-multiple-fil-2 |
exercise |
list.files("data", pattern = "similar") |
| reading-data-from-multiple-fil-3 |
exercise |
list.files("data", pattern = "similar", full.names = TRUE) |
| reading-data-from-multiple-fil-4 |
exercise |
list.files("data",
pattern = "similar",
full.names = TRUE) |>
read_csv() |
| reading-data-from-multiple-fil-5 |
exercise |
list.files("data",
pattern = "similar",
full.names = TRUE) |>
read_csv(na = ".") |
| reading-data-from-multiple-fil-6 |
exercise |
list.files("data",
pattern = "similar",
full.names = TRUE) |>
read_csv(na = ".", show_col_types = FALSE) |
| reading-data-from-multiple-fil-7 |
exercise |
list.files(path = "data",
pattern = "sales") |
| reading-data-from-multiple-fil-8 |
exercise |
# Use readr's read_csv(), not base read.csv(): only read_csv() accepts a
# vector of file paths and stacks the files into one tibble.
list.files(path = "data",
           pattern = "sales",
           full.names = TRUE) |>
  read_csv() |
| reading-data-from-multiple-fil-9 |
exercise |
# Use readr's read_csv(), not base read.csv(): `id` is a read_csv() argument
# that adds a column ("file") recording which source file each row came from.
list.files(path = "data",
           pattern = "sales",
           full.names = TRUE) |>
  read_csv(id = "file") |
| writing-to-a-file-1 |
exercise |
students2 <- students |>
clean_names() |>
mutate(
meal_plan = factor(meal_plan),
age = if_else(age == "five", "5", age),
age = parse_number(age)
)
students2 |
| writing-to-a-file-2 |
exercise |
students2 |
| writing-to-a-file-3 |
exercise |
write_csv(x = students2,
file = "data/students2.csv") |
| writing-to-a-file-4 |
exercise |
read_csv("data/students2.csv") |
| writing-to-a-file-5 |
exercise |
iris_p <- iris |>
ggplot(aes(x = Sepal.Length, y = Sepal.Width)) +
geom_jitter() +
labs(title = "Sepal Dimensions of Various Species of Iris",
x = "Sepal Length",
y = "Sepal Width") |
| writing-to-a-file-6 |
exercise |
list.files("data") |
| writing-to-a-file-7 |
exercise |
read_rds(file = "data/test_1.rds") |
| writing-to-a-file-8 |
exercise |
write_rds(mtcars, "data/test_2.rds") |
| writing-to-a-file-9 |
exercise |
list.files("data") |
| writing-to-a-file-10 |
exercise |
read_rds(file = "data/test_2.rds") |
| writing-to-a-file-11 |
question |
What is Apache Arrow?
Apache Arrow is a multi-language toolbox for building high performance applications that process and transport large data sets. It is designed to improve both the performance of analytical algorithms and the efficiency of moving data from one system (or programming language) to another.
A critical component of Apache Arrow is its in-memory columnar format, a standardized, language-agnostic specification for representing structured, table-like datasets in-memory. This data format has a rich data type system (including nested and user-defined data types) designed to support the needs of analytic database systems, data frame libraries, and more.
The project also contains implementations of the Arrow columnar format in many languages, along with utilities for reading and writing it to many common storage formats. These official libraries enable third-party projects to work with Arrow data without having to implement the Arrow columnar format themselves. For those that want to implement a small subset of the format, the Arrow project contains some tools, such as a C data interface, to assist with interoperability with the official Arrow libraries.
The Arrow libraries contain many software components that assist with systems problems related to getting data in and out of remote storage systems and moving Arrow-formatted data over network interfaces. Some of these components can be used even in scenarios where the columnar format is not used at all.
Lastly, alongside software that helps with data access and IO-related issues, there are libraries of algorithms for performing analytical operations or queries against Arrow datasets. |
| data-entry-1 |
exercise |
tibble(
x = c(1, 2, 5),
y = c("h", "m", "g"),
z = c(0.08, 0.83, 0.60)
) |
| data-entry-2 |
exercise |
tribble(
~x, ~y, ~z,
1, "h", 0.08,
2, "m", 0.83,
5, "g", 0.60
) |
| minutes |
question |
75 |