###Lesson 3 code###

#Chunk 1 opening and exploring the data#
# Install tidyverse only when it is missing, so that re-running this script
# does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
library(readxl)
# Read the workbook; cells containing the text "NA" are treated as missing.
Moz <- read_excel("Mozambique.xlsx", na = "NA")
str(Moz)                            # Show the structure of the imported data
# The calls below only print their result; Moz itself is not modified.
select(Moz, Lineage, Age, Province) # Select columns
select(Moz, -SampleID)              # Negative selection, all except SampleID
filter(Moz, final_dataset == 1)     # Keep rows based on a value in a column
filter(Moz, final_dataset == 1, Res_Summary == "MDR") # Both conditions must hold
# Create a new column from another one: "Yes" when Age >= 18, otherwise "No"
# (if_else() returns NA where Age is missing).
mutate(Moz, adult = if_else(Age >= 18, "Yes", "No"))
group_by(Moz, Lineage)              # Group rows by Lineage; the values are unchanged

#Chunk 2 using the %>% operator#
Moz %>%                  # Use the Moz dataset
  group_by(Lineage) %>%  # Make internal groups based on the Lineage column
  summarise(avg_age = mean(Age, na.rm = TRUE)) # Average age per lineage, ignoring missing ages
count(Moz, Lineage)      # Count rows per value of the Lineage column
# For each lineage, compute the fraction of final-dataset rows whose d12 value
# is anything other than "ungrouped", and print one row per lineage.
Moz %>%
  filter(final_dataset == 1) %>%
  mutate(transmission = if_else(d12 == "ungrouped", "No", "Yes")) %>%
  group_by(Lineage, transmission) %>%
  summarise(lineage_count = n()) %>% # Count based on two groupings, lineage and transmission
  ungroup() %>%          # Remove the internal group structure
  group_by(Lineage) %>%  # Regroup so that sum() below is taken within each lineage
  mutate(transmission_index = lineage_count / sum(lineage_count)) %>%
  filter(transmission == "Yes") %>%
  ungroup()              # Return a plain tibble so no grouping leaks into later steps

#Chunk 3 joining tibbles#
groups <- Moz %>%     # Creating a new table for illustration purposes
  select(SampleID, d12)
Moz <- Moz %>%        # Removing the d12 column from the Mozambique dataset
  select(-d12)
str(Moz)
# Adding groups back to Moz. The key is named explicitly: without `by`,
# left_join() joins on every column the two tables happen to share and prints
# a message; SampleID is the only shared column here once d12 is removed.
Moz <- left_join(Moz, groups, by = "SampleID")

#Chunk 4 Transforming data using pivots#
# Load the semicolon-separated file; its first line holds the column names.
data2 <- read.csv("data2.txt", sep = ";", header = TRUE)
# Stack the height and weight columns into a name/value pair of columns.
data2 <- pivot_longer(
  data2,
  cols = c(height, weight),
  names_to = "Measurement",
  values_to = "Value"
)
# Try to spread the measurements back into separate columns.
# NOTE(review): presumably this does not restore the original rows when no
# column identifies a row uniquely - confirm against data2.txt.
data2 <- pivot_wider(
  data2,
  names_from = "Measurement",
  values_from = "Value"
)
# Start again from the file, this time adding a column of unique identifiers
# before reshaping (three names are supplied, one per row).
data2 <- read.csv("data2.txt", sep = ";", header = TRUE)
data2 <- add_column(data2, name = c("Viola","Ivan", "Christian"))
data2 <- pivot_longer(
  data2,
  cols = c(height, weight),
  names_to = "Measurement",
  values_to = "Value"
)
data2 <- pivot_wider(
  data2,
  names_from = "Measurement",
  values_from = "Value"
)


#Chunk 5 ggplot2#
# Plain scatter plot of Weight against Age.
Moz %>%
  ggplot(mapping = aes(x = Age, y = Weight)) +
  geom_point()

# Same scatter plot, with random jitter applied to the point positions.
Moz %>%
  ggplot(mapping = aes(x = Age, y = Weight)) +
  geom_point(position = "jitter")

# Jittered scatter plot with both axes restricted to fixed limits.
Moz %>%
  ggplot(mapping = aes(x = Age, y = Weight)) +
  geom_point(position = "jitter") +
  scale_x_continuous(limits = c(10, 60)) +
  scale_y_continuous(limits = c(20, 80))

# Scatter plot in which the point shape is mapped to Lineage.
Moz %>%
  ggplot(mapping = aes(x = Age, y = Weight, shape = Lineage)) +
  geom_point()

# Scatter plot coloured by Gender, with a title and explicit axis labels.
# Each label must match the variable mapped to that axis in aes().
ggplot(Moz, aes(x = Age, y = Weight, color = Gender)) +
  geom_point(size = 3, shape = 19) +
  labs(
    title = "Scatter Plot: Age vs Weight",
    x = "Age",     # x aesthetic is Age
    y = "Weight"   # y aesthetic is Weight
  )

# Scatter plot with colour mapped to Gender and transparency mapped to quality.
Moz %>%
  ggplot(mapping = aes(x = Age, y = Weight, color = Gender, alpha = quality)) +
  geom_point(size = 3, shape = 19)

# Scatter plot with a smoothed trend line over all points.
Moz %>%
  ggplot(mapping = aes(x = Age, y = Weight)) +
  geom_point() +
  geom_smooth()

# Colour is set in ggplot(), so both layers inherit it and geom_smooth()
# draws one trend line per Gender.
Moz %>%
  ggplot(mapping = aes(x = Age, y = Weight, color = Gender)) +
  geom_point() +
  geom_smooth()

# Bar chart of row counts per collection year, filled by Res_Summary.
Moz %>%
  ggplot(mapping = aes(x = Year_of_sample_collection, fill = Res_Summary)) +
  geom_bar()

library(viridis)
# Stacked bar chart using the discrete viridis palette for the fill.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
ggplot(Moz, aes(x = Year_of_sample_collection, fill = Res_Summary)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE)

# position = "fill" rescales every bar to the same height, showing proportions.
ggplot(Moz, aes(x = Year_of_sample_collection, fill = Res_Summary)) +
  geom_bar(position = "fill") +
  scale_fill_viridis(discrete = TRUE)

# position = "dodge" places the bars of each year side by side.
ggplot(Moz, aes(x = Year_of_sample_collection, fill = Res_Summary)) +
  geom_bar(position = "dodge") +
  scale_fill_viridis(discrete = TRUE) +
  theme_classic()

# Jittered points of Res_Summary against Lineage, one panel per Province,
# coloured by collection year on a continuous viridis scale.
# NOTE(review): discrete = FALSE assumes Year_of_sample_collection is numeric - confirm.
ggplot(Moz, aes(x = Lineage, y = Res_Summary)) +
  geom_point(position = "jitter", aes(color = Year_of_sample_collection)) +
  facet_wrap(~ Province) +
  scale_color_viridis(discrete = FALSE, option = "B")