###Cleaning and analyzing Mozambique data### library(tidyverse) library(readxl) library(viridis) ###Reading in the data### Moz <- read_excel("Exercise_2.xlsx") str(Moz) ###NA in Age column### ggplot(Moz, aes(x = Age)) + #we see a bigger bar at the end meaning most likely 99yo is a fake value geom_bar() ggplot(Moz, aes(x = quality, y = Age, group = quality)) + #just a second way at looking at the same data geom_boxplot()+ geom_jitter() ###Filter to 704### #here i filtered to 711. You can just set filter(final_dataset == 1) and you would get 704 Moz <- Moz %>% filter(Coverage > 30, quality < 3, WHO_Treatment != "No information", Res_Summary != "S") ###Changing the "-" in antibiotics columns### mutate(RMP = na_if(RMP, "-")) %>% #for every column na_if() replaces the specified value with NA mutate(SM = na_if(SM, "-")) %>% mutate(EMB = na_if(EMB, "-")) %>% mutate(PZA = na_if(PZA, "-")) %>% mutate(MFX = na_if(MFX, "-")) %>% mutate(CFZ = na_if(CFZ, "-")) %>% mutate(`ETH/PTH` = na_if(`ETH/PTH`, "-")) %>% mutate(BDQ = na_if(BDQ, "-")) ###Who gets more tb### ggplot(Moz, aes(x = Gender, fill = Gender)) + #if you use color instead of fill only the outline of the bars will be colored. try it out geom_bar() ###Which antibiotic has most resistance to### antibiotics <- Moz %>% select(SM:BDQ) # : includes all columns between the two mentioned colSums(!is.na(antibiotics)) #counting up all non NA values in each column ###Does XDR transmit the most### Moz %>% mutate(transmission = if_else(d12 == "ungrouped", "No", "Yes")) %>% group_by(Res_Summary,transmission) %>% summarise(Resistance_count = n()) %>% ungroup() %>% group_by(Res_Summary) %>% mutate(transmission_index = Resistance_count/sum(Resistance_count)) %>% filter(transmission == "Yes") ###Count two antibiotics### SM_mutations <- Moz %>% #this crates a data frame of SM mutations and counts them select(SM) %>% group_by(SM) %>% count() SM_mutations <- Moz %>% #this does the same as previous but stratifies on the lineage level as well select(SM, Lineage) %>% group_by(SM, Lineage) %>% count() BDQ_mutations <- Moz %>% #the same as first SM one select(BDQ) %>% group_by(BDQ) %>% count() BDQ_mutations <- Moz %>% #the same as second SM one select(BDQ, Lineage) %>% group_by(BDQ, Lineage) %>% count() ###Plot resistances over time### increase <- Moz %>% select(Year_of_sample_collection, RMP, BDQ) #select only columns of interest I491F <- increase %>% #create a data frame of only I491F samples and their frequency per year group_by(Year_of_sample_collection, RMP) %>% summarize(I491F_count = n()) %>% mutate(I491F_freq = round(I491F_count/sum(I491F_count), 3)) %>% filter(RMP == "rpoB I491F") BDQ_res <- increase %>% #create a data frame of only BDQ resistant samples and their frequency per year mutate(BDQ = if_else(!is.na(BDQ), "Resistant", NA)) %>% group_by(Year_of_sample_collection, BDQ) %>% summarize(BDQ_count = n()) %>% mutate(BDQ_freq = round(BDQ_count/sum(BDQ_count), 3)) %>% filter(!is.na(BDQ)) combined <- left_join(BDQ_res, I491F) #joined the previous two datsets combined_long <- pivot_longer(combined, cols = c(BDQ_freq, I491F_freq), #made the BDQ and I491F columns into a categorical one using pivot to make plotting possible names_to = "Measure", values_to = "Value") ggplot(combined_long, aes(color = Measure, group = Measure, x = Year_of_sample_collection, y = Value)) + geom_point() + #scatterplot of resistances geom_smooth(method="lm", se = F) + #lm of resistance increase over time theme_classic() + scale_y_continuous(limits = c(0, 0.25)) + #adding a fitting scale that tops at 25% labs(title = "Increase of resistance over time", x = "Year of sample collection", y = "Frequency of resistant samples")