---
title: "Day 1 - Advanced AMR Data Visualization in R"
output: html_document
---

```{r setup, include=FALSE}
# Load required libraries
library(tidyverse)
library(janitor)
library(pheatmap)
library(patchwork)
library(ComplexHeatmap) # Optional for advanced heatmaps
```

## Load and Clean the Phenotype Dataset

```{r}
# Load amr_phenotype_eu.csv
phenotype_raw <- read.csv("amr_phenotype_eu.csv")
phenotype_raw #lets call it "raw" for now - uncleaned dataset
```

```{r}
#Clean and standardize the phenotype dataset

phenotype <- phenotype_raw %>%  # Start from the raw dataset
  clean_names()  # Convert column names to lowercase and snake_case for consistency (from janitor package)

phenotype <- phenotype %>%
  mutate(
    # Capitalize the first letter of each word in species and country (e.g., "france" → "France")
    species = str_to_title(species),
    country = str_to_title(country),
    
    # Convert antibiotic test result values to uppercase ("r" → "R", "s" → "S")
    # The tilde (~) introduces an anonymous function, and the dot (.) refers to each value in the columns
    across(c(ciprofloxacin:colistin), ~str_to_upper(.))
  )

# View the cleaned data
phenotype  # Use head(phenotype) for large datasets

```

##Mini-Exercise: Explore Your Cleaned Dataset
Try the following tasks using dplyr functions:

Count how many isolates are from each country using count(country)
Use glimpse() to explore the structure of the dataset
Bonus: Can you filter for all E. coli isolates from France in 2021?

```{r}
# Count number of isolates per country
phenotype %>%
  count(country)

# Get a quick overview of the dataset structure
glimpse(phenotype) 
# glimpse() shows variables and data types in a transposed layout,
# making it easy to see every column at a glance.

# BONUS: Filter for E. coli isolates from France in 2021
phenotype %>%
  filter(species == "E. Coli", country == "France", year == 2021)

```

## Convert to Long Format and handle missing values

```{r}
#Reshape the phenotype data to long format and handle missing values

phenotype_long <- phenotype %>%
  # Convert from wide to long format: one row per antibiotic result
  pivot_longer(
    cols = ciprofloxacin:colistin,          # All antibiotic test columns
    names_to = "antibiotic",                # New column for antibiotic names
    values_to = "resistance"                # New column for resistance results (S, R, or NA)
  ) %>%
  mutate(
    # Capitalize antibiotic names for clean plotting
    antibiotic = str_to_title(antibiotic),
    
    # Convert resistance column to a factor with levels in logical order
    resistance = factor(resistance, levels = c("S", "R"))
  )

# Option 1: Keep NA values for analysis or visualizing missing data
phenotype_long

# Option 2: Remove rows with missing resistance data (uncomment to use)
phenotype_long <- phenotype_long %>%
filter(!is.na(resistance))

#View the reshaped dataset
phenotype_long

```

##Mini-Exercise: 
Now that your dataset is in long format, try filtering for resistant results only and count how many resistant isolates there are per species.

```{r}
# Filter for resistant isolates only
resistant_only <- phenotype_long %>%
  filter(resistance == "R")

# Count resistant isolates by species
resistant_only %>%
  count(species)

```

## Resistance Proportion by Country and Year
- Now, we group the data by country, year, antibiotic, and resistance to count how many isolates fall into each resistance category.
- Then, we calculate the resistance proportion within each group (e.g., 1 resistant out of 2 total = 0.5).
- Because our dataset currently has only 1 or 2 isolates per group, the calculated proportions are limited to 0.5 or 1. With larger datasets, you'd see more variation (e.g., 0.33, 0.75, etc.).

```{r}
# Summarize resistance proportions
res_prop <- phenotype_long %>%
  # Count the number of isolates by country, year, antibiotic, and resistance status ("S" or "R")
  group_by(country, year, antibiotic, resistance) %>%
  summarise(count = n(), .groups = 'drop') %>%  # Drop groups to prevent grouped behavior downstream

  # Re-group by country, year, and antibiotic (excluding resistance category)
  group_by(country, year, antibiotic) %>%
  # Calculate the proportion of each resistance type (e.g., R or S) within each group
  mutate(proportion = count / sum(count))

# Preview resistant proportions only
res_prop %>%
  filter(resistance == "R")
res_prop
```
## Resistance Proportion by Country and Year with larger dataset 
```{r}
# Load and clean the dataset
phenotype_large_raw <- read_csv("amr_phenotype_larger.csv")
phenotype_large <- phenotype_large_raw %>%
  clean_names() %>%
  mutate(
    species = str_to_title(species),
    country = str_to_title(country),
    across(c(ciprofloxacin:colistin), ~str_to_upper(.))
  )

# Reshape to long format
phenotype_large_long <- phenotype_large %>%
  pivot_longer(
    cols = ciprofloxacin:colistin,
    names_to = "antibiotic",
    values_to = "resistance"
  ) %>%
  mutate(
    resistance = factor(resistance, levels = c("S", "R")),
    antibiotic = str_to_title(antibiotic)
  )

# Summarize resistance proportions
res_prop_large <- phenotype_large_long %>%
  group_by(country, year, antibiotic, resistance) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(country, year, antibiotic) %>%
  mutate(proportion = count / sum(count))

# Preview resistant proportions only
res_prop_large %>% filter(resistance == "R")
```
That's a standard informational message from the readr::read_csv() function. It just tells you:

The dataset has 300 rows and 10 columns.
It has guessed the column types:
9 are character (chr): e.g. Species, Country, Antibiotics, etc.
1 is numeric/double (dbl): Year.
It suggests you can:
Use spec() to see the full column type specification.
Add show_col_types = FALSE to silence this message.

## Ciprofloxacin Resistance by Country (Styled Plot)

```{r}
cipro_plot <- phenotype_long %>%
  filter(antibiotic == "Ciprofloxacin") %>% #add !is.na(resistance) to remove NAs 
  ggplot(aes(x = country, fill = resistance)) +
  geom_bar(position = "fill") +
  labs(title = "Ciprofloxacin Resistance by Country",
       y = "Proportion of Isolates", x = "Country") +
  theme_classic() +
  scale_fill_manual(values = c("S" = "steelblue", "R" = "firebrick"))

cipro_plot
```
```{r}
# Ciprofloxacin resistance plot from the large dataset
cipro_plot_large <- phenotype_large_long %>%
  filter(antibiotic == "Ciprofloxacin", !is.na(resistance)) %>%
  ggplot(aes(x = country, fill = resistance)) +
  geom_bar(position = "fill") +
  labs(title = "Ciprofloxacin Resistance by Country (Large Dataset)",
       y = "Proportion of Isolates", x = "Country") +
  theme_classic() +
  scale_fill_manual(values = c("S" = "steelblue", "R" = "firebrick"))

cipro_plot_large
```
##Mini-Exercise:
Update the large dataset plot to explore resistance to a different antibiotic.

Tasks:

-Change the antibiotic filter to another drug (e.g., "Ampicillin" or "Colistin").
-Try switching to a different ggplot2 theme such as theme_minimal() or theme_bw().
-(Optional) Customize the plot title to match your new antibiotic.

```{r}
# Example: Visualizing resistance to Ampicillin
amp_plot <- phenotype_large_long %>%
  filter(antibiotic == "Ampicillin", !is.na(resistance)) %>%
  ggplot(aes(x = country, fill = resistance)) +
  geom_bar(position = "fill") +
  labs(title = "Ampicillin Resistance by Country",
       y = "Proportion of Isolates", x = "Country") +
  theme_minimal() +
  scale_fill_manual(values = c("S" = "steelblue", "R" = "firebrick"))

amp_plot

```


## Faceted Plot: Resistance by Antibiotic and Year

```{r}
# Create a faceted bar plot to visualize resistance trends by year (excluding missing resistance data)

facet_plot <- phenotype_large_long %>%
  filter(!is.na(resistance)) %>%                    # Remove rows with missing resistance values
  ggplot(aes(x = antibiotic, fill = resistance)) +  # X-axis: antibiotics; fill by resistance status (S or R)
  geom_bar(position = "fill") +                     # Use "fill" to show proportions within each antibiotic group
  facet_wrap(~year) +                               # Create a separate panel (facet) for each year
  labs(
    title = "Resistance by Antibiotic and Year",    # Main plot title
    y = "Proportion",                               # Y-axis label showing proportions (stacked bars)
    x = "Antibiotic"                                # X-axis label
  ) +
  theme_bw() +                                      # Use a clean black-and-white theme
  scale_fill_manual(                                # Set custom fill colors for "S" and "R"
    values = c("S" = "seagreen", "R" = "tomato")
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)  # Rotate x-axis labels for readability
  )

facet_plot


```

Mini-Exercise: Facet the large dataset by both species and year

```{r}
#Mini-Exercise: Facet the large dataset by both species and year

facet_grid_plot <- phenotype_large_long %>%
  filter(!is.na(resistance)) %>%                         # Exclude missing resistance values
  ggplot(aes(x = antibiotic, fill = resistance)) +       # Plot resistance by antibiotic
  geom_bar(position = "fill") +                          # Use fill to show proportions
  facet_grid(species ~ year) +                           # Facet by species (rows) and year (columns)
  labs(
    title = "Resistance by Antibiotic, Species, and Year",
    y = "Proportion",
    x = "Antibiotic"
  ) +
  theme_light() +                                        # Light theme for contrast
  scale_fill_manual(values = c("S" = "seagreen", "R" = "tomato")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

facet_grid_plot


```
As you can see here, the faceted bar plot is quite detailed — but with five species, three years, and four antibiotics, it becomes visually overwhelming. It's difficult to spot patterns or outliers at a glance.

This is where a heatmap becomes a powerful tool — it lets us condense many values into a single visual matrix where color intensity communicates resistance proportions. This makes it much easier to compare across species, antibiotics, and years at once."


```{r}
# We start by ensuring that all isolate IDs are unique.
# This is important because we'll use these IDs as row names in our matrix.
# If any duplicates exist, this step will make them unique by adding suffixes like ".1", ".2", etc.
phenotype_unique <- phenotype %>%
  mutate(isolate_id = make.unique(as.character(isolate_id)))
phenotype_unique

```

## Step 1: Ensure Unique Isolate IDs
```{r}
# Step 1: Ensure isolate IDs are unique
phenotype_unique <- phenotype %>%
  mutate(isolate_id = make.unique(as.character(isolate_id)))
phenotype_unique
```

##Step 2: Create Binary Resistance Matrix
```{r}
# Next, we create a resistance matrix where:
# - Rows are isolates
# - Columns are antibiotics
# - "R" is converted to 1 (resistant), and everything else to 0 (not resistant)
# This binary matrix is what we will use in our heatmap.
phenotype_matrix <- phenotype_unique %>%
  select(isolate_id, ciprofloxacin:colistin) %>%
  mutate(across(c(ciprofloxacin:colistin), ~ ifelse(. == "R", 1, 0))) %>%
  column_to_rownames("isolate_id") %>%
  as.matrix()
phenotype_matrix

```

##Step 3a: Create Annotations for Heatmap Rows
```{r}
# To make our heatmap easier to interpret, we add annotations:
# - Species and country will be shown alongside the rows of the heatmap.
# - We use str_to_title() to clean and standardize the formatting (e.g., "e.coli" → "E. Coli")
annotation_row <- phenotype_unique %>%
  select(isolate_id, species, country) %>%
  mutate(
    Species = str_to_title(species),
    Country = str_to_title(country)
  ) %>%
  select(isolate_id, Species, Country) %>%
  column_to_rownames("isolate_id")
annotation_row

```

##Step 3b: Order the Matrix and Annotations by Country 
```{r}
# Optionally, we can sort the heatmap by country (and implicitly by species within each country).
# This makes the heatmap visually cleaner and easier to interpret geographically.
annotation_row <- annotation_row %>%
  arrange(Country)

# We also need to reorder the resistance matrix to match the sorted annotation rows.
phenotype_matrix <- phenotype_matrix[rownames(annotation_row), ]

annotation_row
phenotype_matrix

```

##Step 4: Define Color Palettes for Annotations 
```{r}
# Now we assign specific colors to species and countries for the heatmap annotations.
# This ensures consistent and interpretable color coding across the plot.
species_colors <- setNames(
  RColorBrewer::brewer.pal(length(unique(annotation_row$Species)), "Set2"),
  unique(annotation_row$Species)
)
country_colors <- setNames(
  RColorBrewer::brewer.pal(length(unique(annotation_row$Country)), "Set3"),
  unique(annotation_row$Country)
)

# Combine the palettes into one list to pass to the heatmap
annotation_colors <- list(
  Species = species_colors,
  Country = country_colors
)

```

##Step 4: Preview Heatmap Without Annotations
```{r}
# If you want to preview the heatmap without any annotations, you can use this simpler version.
# This shows resistance patterns only, without the context of species or country.
pheatmap(
  phenotype_matrix,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  # annotation_row = annotation_row,         # <- comment out to remove annotations
  # annotation_colors = annotation_colors,   # <- comment out to remove annotation coloring
  display_numbers = TRUE,
  color = colorRampPalette(c("green", "red"))(100),
  main = "AMR Heatmap with Species and Antibiotics"
)

```

##Step 5: Final Heatmap with Annotations
```{r}
# Finally, we generate the full heatmap:
# - Each cell shows resistant (1 = red) or not resistant (0 = green)
# - Row annotations for species and country add biological and geographic context
# - No clustering is applied — rows and columns are displayed in the order we specified
pheatmap(
  phenotype_matrix,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  annotation_row = annotation_row,
  annotation_colors = annotation_colors,
  display_numbers = TRUE,
  color = colorRampPalette(c("green", "red"))(100),
  main = "AMR Heatmap with Species and Country Annotations"
)

```

## Using ComplexHeatmap
## Basic Heatmap Using ComplexHeatmap (No Annotations)

```{r eval=FALSE}
# Load required package
library(ComplexHeatmap)

# Create a basic heatmap (no annotations or clustering)
Heatmap(
  phenotype_matrix,                   # The resistance matrix (1 = R, 0 = S)
  name = "Resistance",                # Legend title
  col = c("0" = "seagreen", "1" = "tomato"),  # Color scale for susceptible and resistant
  cluster_rows = FALSE,              # Do not cluster rows (no dendrogram)
  cluster_columns = FALSE,           # Do not cluster columns
  row_names_side = "left",           # Show isolate names on the left
  column_title = "Antibiotics",      # Title for the columns
  row_title = "Isolates"             # Title for the rows
)

```

##Heatmap with Species and Country Annotations

```{R}
library(ComplexHeatmap)
library(circlize)  # For color mapping

# Step 1: Prepare annotation data
row_anno_data <- phenotype_unique %>%
  select(isolate_id, species, country) %>%   # Select relevant annotation columns
  distinct() %>%                             # Remove duplicates
  mutate(
    isolate_id = make.unique(as.character(isolate_id)),  # Ensure isolate IDs are unique
    species = str_to_title(species),                     # Standardize species names
    country = str_to_title(country)                      # Standardize country names
  ) %>%
  column_to_rownames("isolate_id")                       # Set isolate_id as row names

# Step 2: Define color schemes for annotations
species_levels <- unique(row_anno_data$species)
country_levels <- unique(row_anno_data$country)

anno_colors <- list(
  species = setNames(                               # Map each species to a unique color
    RColorBrewer::brewer.pal(length(species_levels), "Set2"),
    species_levels
  ),
  country = setNames(                               # Map each country to a unique color
    RColorBrewer::brewer.pal(length(country_levels), "Pastel1"),
    country_levels
  )
)

# Step 3: Create annotation object
row_anno <- rowAnnotation(
  df = row_anno_data,                 # Provide the species and country annotations
  col = anno_colors,                  # Use the defined color schemes
  show_annotation_name = TRUE         # Show annotation titles (e.g., Species, Country)
)

# Step 4: Plot annotated heatmap
Heatmap(
  phenotype_matrix,                   # Resistance matrix
  name = "Resistance",                # Legend title
  col = c("0" = "seagreen", "1" = "tomato"),  # Resistance colors
  cluster_rows = FALSE,              # Turn off row clustering
  cluster_columns = FALSE,           # Turn off column clustering
  left_annotation = row_anno,         # Add row annotations for species and country
  row_names_side = "left",            # Isolate IDs on the left
  column_title = "Antibiotics",       # Title for antibiotic columns
  row_title = "Isolates"              # Title for isolate rows
)

```

Your Task:
Data Cleaning:
Clean the species column to harmonize values like "E. Coli", "e.coli", "Salmonella sp.", etc.
Handle missing or empty resistance values ("", NA), and recode:
"R" → 1
"S" → 0
Blank or missing → leave as NA or impute sensibly.
Matrix Preparation:
Create a resistance matrix with rows = isolate_id, columns = antibiotics.
Make sure row names are unique.
Heatmap Visualization:
Create a basic heatmap with no clustering or annotations.
Then add annotations for:
species (color-coded)
country (color-coded)
Optionally, arrange the heatmap by country or species.


```{r}
# Load libraries
library(tidyverse)
library(pheatmap)
library(janitor)

# Read data
df <- read_csv("amr_phenotypes_dirty.csv") %>%
  clean_names()

# Clean species names
df <- df %>%
  mutate(species = case_when(
    str_detect(species, regex("coli", ignore_case = TRUE)) ~ "E. coli",
    str_detect(species, regex("salmonella", ignore_case = TRUE)) ~ "Salmonella sp.",
    TRUE ~ species
  ))

# Create binary resistance matrix
df_matrix <- df %>%
  mutate(across(c(ciprofloxacin, ampicillin, tetracycline, colistin),
                ~ ifelse(. == "R", 1, ifelse(. == "S", 0, NA_real_)))) %>%
  mutate(isolate_id = make.unique(isolate_id)) %>%
  column_to_rownames("isolate_id") %>%
  select(ciprofloxacin, ampicillin, tetracycline, colistin) %>%
  as.matrix()

# Optional: create annotation data
annotation <- df %>%
  select(isolate_id, species, country) %>%
  distinct() %>%
  column_to_rownames("isolate_id")

# Colors
ann_colors <- list(
  species = RColorBrewer::brewer.pal(length(unique(annotation$species)), "Set2") %>%
    setNames(unique(annotation$species)),
  country = RColorBrewer::brewer.pal(length(unique(annotation$country)), "Set3") %>%
    setNames(unique(annotation$country))
)

# Plot
pheatmap(df_matrix,
         annotation_row = annotation,
         annotation_colors = ann_colors,
         display_numbers = TRUE,
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         color = colorRampPalette(c("green", "red"))(100),
         main = "AMR Heatmap (Cleaned & Annotated)")

```