#################################################################################################
#Script name: ITS2 Amplicon data
#Purpose: Analyse and visualize plant NGS Amplicon ITS2 data from Pentatoma rufipes and Halyomorpha halys guts
#Author: Olivier Berteloot
#Date: 18-01-2024
#Version: 1.0
#E-mail: olivier.berteloot@gmail.com
################################################################################################

# Where is the working directory
getwd()

# Load packages
library(tidyverse)
library(openxlsx)
library(MASS)
library(car)
library(ggplot2)
library(RColorBrewer)
library(forcats)
library(dplyr)
library(scales)
library(Biostrings)
library(gridExtra)
library(vegan)
library(lubridate)

#################################################################################################
##################PREPROCESSIN: FIELD COLLECTED SAMPLES #########################################
#################################################################################################

### read the table, taxonomy and metadata
otutable <- read_tsv('table_clustered_all.tsv')
otutaxonomy <- read_tsv('taxonomy_consensus_clustered_all.tsv')
meta_all <- read_tsv("meta_all.tsv")

# Merge the two data frames based on the common column 'otu id'
merged_data <- inner_join(otutable, otutaxonomy, by = "otu_id")

# Extract the entire column of taxon from otutaxonomy
taxon_column <- otutaxonomy$taxon

# Adjust column names
sample_columns <- names(merged_data)[which(!names(merged_data) %in% c("otu ids", "taxon"))]

# Identify and handle non-numeric values in the sample count columns
merged_data[sample_columns] <- lapply(merged_data[sample_columns], function(x) {
  ifelse(!grepl("^\\d+$", as.character(x), ignore.case = TRUE), NA, as.numeric(x))
})

# Group by taxon and summarize the counts for each sample
aggregated_data <- merged_data %>%
  group_by(taxon) %>%
  summarise(across(all_of(sample_columns), sum, na.rm = TRUE))

# Print the aggregated data frame
print(aggregated_data)

# Remove the 'otu_id' column
aggregated_data <- dplyr::select(aggregated_data, -2)

long_data <- aggregated_data %>%
  pivot_longer(-taxon, names_to = "sample_id", values_to = "count")

# Left join the long format data with the metadata based on 'sample_id'
combined_data <- left_join(long_data, meta_all, by = c("sample_id" = "sample_id"))

print(combined_data)

# Save to file
write.table(combined_data, file='combined_data.tsv', sep='\t', row.names=FALSE)

# Identify sample-id's with total count < 10000
sample_ids_to_exclude <- combined_data %>%
  group_by(`sample_id`) %>%
  summarize(total_count = sum(count)) %>%
  filter(total_count < 10000) %>%
  pull(`sample_id`) # Extracts the sample-id's to a vector
print(sample_ids_to_exclude)

# Filter out these sample-id's from the original dataset
filtered_data <- combined_data %>%
  filter(!(`sample_id` %in% sample_ids_to_exclude))

print(filtered_data)

# Filter out rows where the count is not equal to 0
filtered_data <- filtered_data %>%
  filter(count != 0)

print(filtered_data)

# Calculate the relative abundance per sample as a percentage
filtered_data_rel <- filtered_data %>%
  group_by(sample_id) %>%
  mutate(relative_abundance = (count / sum(count)) * 100)

# Filter out rows where the relative abundance is lower than 1%
filtered_data_rel <- filtered_data_rel %>%
  filter(relative_abundance >= 1)

head(filtered_data_rel)

write.table(combined_data, file='filtered_data_rel.tsv', sep='\t', row.names=FALSE)

#################################################################################################
################## Metric: unique genera per stink bug species ##################################
#################################################################################################

#unassigned & chlorophyta can not be counted as a taxon in this case

filtered_data_rel_removed <- filtered_data_rel %>%
  filter(taxon != "Unassigned" & taxon !="Chlorophyta")
head(filtered_data_rel_removed)

#for H. halys
median_taxon_per_sample_halyomorpha_halys <- filtered_data_rel_removed %>%
  filter(species == "Halyomorpha halys") %>%
  group_by(sample_id) %>%
  summarize(num_taxon = n_distinct(taxon)) %>%
  summarize(median = median(num_taxon))

print(median_taxon_per_sample_halyomorpha_halys)

#for P. rufipes
median_taxon_per_sample_pentatoma_rufipes <- filtered_data_rel_removed %>%
  filter(species == "Pentatoma rufipes") %>%
  group_by(sample_id) %>%
  summarize(num_taxon = n_distinct(taxon)) %>%
  summarize(median = median(num_taxon))

print(median_taxon_per_sample_pentatoma_rufipes)

#################################################################################################
################## visualization: Barplots ######################################################
#################################################################################################

# Define a function to aggregate genera with relcount < 5% into 'Other', as otherwise barplots might be too  hard to interpret
aggregate_taxon <- function(df) {
  # Threshold for relative abundance
  threshold <- 5
  
  # Mark rows with relcount < threshold as 'Other'
  df$taxon <- ifelse(df$relative_abundance < threshold, 'Other', df$taxon)
  
  return(df)
}

# Apply the function 
filtered_data_rel_other <- filtered_data_rel_removed %>%
  group_by(sample_id) %>%
  do(aggregate_taxon(.)) %>%
  ungroup() %>% # To remove the grouping structure
  dplyr::select(sample_id, everything()) # Explicitly use dplyr's select function


# View the first few rows of the modified dataset
head(filtered_data_rel_other)

write.table(filtered_data_rel_other, file='filtered_data_rel_other.tsv', sep='\t', row.names=FALSE)

df<-filtered_data_rel_other

# subset per stink bug species
halys_data <- df %>%
  filter(species == "Halyomorpha halys")

penta_data <- df %>%
  filter(species == "Pentatoma rufipes")

# Vector of all unique taxons (that will appear in the barplot)
all_taxons <- unique(df$taxon)
print(all_taxons)

# Vector for Halyomorpha halys
halys_taxons <- unique(halys_data$taxon)

# Vector for Pentatoma rufipes
penta_taxons <- unique(penta_data$taxon)

# Function to sort vectors while keeping "Other" and "Unassigned" at the end
sort_vector <- function(vector) {
  special_cases <- c("Other")
  sorted_vector <- sort(setdiff(vector, special_cases))
  special_in_vector <- intersect(special_cases, vector)
  return(c(sorted_vector, special_in_vector))
}

# Sorting the vectors
all_taxons_sorted <- sort_vector(all_taxons)
halys_taxons_sorted <- sort_vector(halys_taxons)
penta_taxons_sorted <- sort_vector(penta_taxons)

halyo_order<-halys_taxons_sorted
penta_order<-penta_taxons_sorted

print(halyo_order)
print(penta_order)

#Create custom color palettes
n <- 14
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector_p = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
pie(rep(1,n), col=sample(col_vector_p, n))


nh <- 46
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector_h = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
pie(rep(1,nh), col=sample(col_vector_h, nh))


# BARPLOT FOR HALYOMORPHA HALYS 
halyo_plot <- ggplot(halys_data, aes(x = sample_id, y = relative_abundance, fill = taxon)) +
  geom_bar(stat = "identity", position = "fill", width = 0.5) +
  scale_y_continuous(name = "Relative abundance", labels = percent_format()) +
  scale_fill_manual(values = col_vector_h, breaks = halyo_order) +
  ggtitle("Halyomorpha halys") +
  labs(fill = "Genus") +  # Setting the legend title
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold.italic", hjust = 0.5),
    axis.text.x = element_text(angle = 90, vjust = 0.5, size = 6),
    axis.text.y = element_text(color = "black"),
    strip.text = element_text(angle = 90, face = "bold"),
    strip.background = element_blank(),
    panel.border = element_rect(fill = NA, color = "black"),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank()
  )
print(halyo_plot)

# BARPLOT FOR PENTATOMA RUFIPES, FACET GRID BY HOSTPLANT
penta_plot <- ggplot(penta_data, aes(x = sample_id, y = relative_abundance, fill = factor(taxon, levels = penta_order))) +
  geom_bar(stat = "identity", position = "fill", width = 0.5) +
  facet_grid(~ hostplant, scales = "free_x", space = "free_x") +
  scale_y_continuous(name = "Relative abundance", labels = percent_format()) +
  scale_fill_manual(values = col_vector_p, breaks = penta_order) + # Assuming col_vector_h is defined
  ggtitle("Pentatoma rufipes") +
  theme_minimal() +
  labs(fill = "Genus") +  # Setting the legend title
  theme(
    plot.title = element_text(size = 14, face = "bold.italic", hjust = 0.5),
    axis.text.x = element_text(angle = 90, vjust = 0.5, size = 6),
    axis.text.y = element_text(color = "black"),
    strip.text = element_text(angle = 45, face = "bold"),
    strip.background = element_blank(),
    panel.border = element_rect(fill = NA, color = "black"),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank()
  )

print(penta_plot)

##########################################################################################
################## visualization: top 10 histogram #######################################
##########################################################################################

df_filt<-filtered_data_rel_removed

###ranked data
ranked_halyo <- df_filt %>%
  filter(species == "Halyomorpha halys") %>%
  group_by(taxon) %>%
  summarise(Count = n_distinct(sample_id), .groups = "drop") %>%
  arrange(desc(Count))
print(ranked_halyo)


ranked_penta <- df_filt %>%
  filter(species == "Pentatoma rufipes") %>%
  group_by(taxon) %>%
  summarise(Count = n_distinct(sample_id), .groups = "drop") %>%
  arrange(desc(Count))
print(ranked_penta)

# Calculate the top 10 genera for Halyomorpha halys based on counts
top_10_genera_halyo <- df_filt %>%
  filter(species == "Halyomorpha halys") %>%
  group_by(taxon) %>%
  summarise(Count = n_distinct(sample_id), .groups = "drop") %>%
  arrange(desc(Count)) %>%
  top_n(10, Count)

top_10_genera_halyo

# Calculate the top 10 genera for Pentatoma rufipes based on counts
top_10_genera_penta <- df_filt %>%
  filter(species == "Pentatoma rufipes") %>%
  group_by(taxon) %>%
  summarise(Count = n_distinct(sample_id), .groups = "drop") %>%
  arrange(desc(Count)) %>%
  top_n(10, Count)
top_10_genera_penta

##calculate the top 10 genera for both species
top_10_genera_all<-df_filt %>%
  group_by(taxon) %>%
  summarise(Count = n_distinct(sample_id), .groups = "drop") %>%
  arrange(desc(Count)) %>%
  top_n(10, Count)

# Identify common genera
common_genera <- intersect(top_10_genera_halyo$taxon, top_10_genera_penta$taxon)

# Assign colors to common genera (the same colors in both plots) from RColorBrewer Set1 palette
library(RColorBrewer)
common_colors <- brewer.pal(length(common_genera), "Set1")

# Assign colors to non-shared genera for each species
unique_colors_halyo <- brewer.pal(length(setdiff(top_10_genera_halyo$taxon, common_genera)), "Set2")
unique_colors_penta <- brewer.pal(length(setdiff(top_10_genera_penta$taxon, common_genera)), "Set3")

# Create a mapping of common genera to colors
common_colors_mapping <- setNames(common_colors, common_genera)

# Map colors to both species based on common genera
top_10_genera_halyo$Color <- ifelse(top_10_genera_halyo$taxon %in% common_genera, common_colors_mapping[top_10_genera_halyo$taxon], unique_colors_halyo)
top_10_genera_penta$Color <- ifelse(top_10_genera_penta$taxon %in% common_genera, common_colors_mapping[top_10_genera_penta$taxon], unique_colors_penta)

# Create separate plots for each species with custom color palette
plot_halyo <- ggplot(top_10_genera_halyo, aes(x = reorder(taxon, -Count), y = Count, fill = Color)) +
  geom_col() +
  labs(title = "Top 10 most detected genera in samples (Halyomorpha halys)",
       x = "taxon", y = "Number of samples present") +
  scale_fill_identity() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot_halyo

plot_penta <- ggplot(top_10_genera_penta, aes(x = reorder(taxon, -Count), y = Count, fill = Color)) +
  geom_col() +
  labs(title = "Top 10 most detected genera in samples (Pentatoma rufipes)",
       x = "taxon", y = "Number of samples present") +
  scale_fill_identity() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot_penta

# Display the plots side by side
grid.arrange(plot_halyo, plot_penta, ncol = 2)

##########################################################################################
################## visualization: hostplant matching (P. rufipes) ########################
##########################################################################################

penta_data_all <- filtered_data_rel_removed %>%
  filter(species == "Pentatoma rufipes")

percentage_matching <- penta_data_all %>%
  group_by(sample_id) %>%
  summarize(has_match = any(taxon == hostplant)) %>%
  summarize(percentage = mean(has_match) * 100)

percentage_matching

# Create a data frame for plotting
pie_data <- data.frame(
  label = c("Matching", "Not matching"),
  value = c(percentage_matching$percentage, 100 - percentage_matching$percentage)
)

# Create a pie chart with percentage labels
pie_chart <- ggplot(pie_data, aes(x = "", y = value, fill = label)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(fill = "Match Status") +
  ggtitle("Percentage of samples with taxon matching hostplant for Pentatoma rufipes") +
  theme_void() +
  theme(legend.title = element_blank())

# Add percentage labels to the pie chart
pie_chart_with_labels <- pie_chart +
  geom_text(aes(label = paste0(round(value, 1), "%")), position = position_stack(vjust = 0.5))

# Display the pie chart with percentage labels
print(pie_chart_with_labels)

##########################################################################################
################## visualization: counts per taxon per stinkbug   ########################
##########################################################################################

# Group the data by species and taxon, and calculate the sum of counts for each taxon within each species
ranked_data <- filtered_data_rel %>%
  group_by(species, taxon) %>%
  summarize(total_count = sum(count)) %>%
  arrange(species, desc(total_count)) %>%
  mutate(rank = row_number())

# View the ranked data
head(ranked_data)


# Create a barplot to visualize the ranking of genera within each species
ggplot(ranked_data, aes(x = reorder(taxon, -total_count), y = total_count, fill = species)) +
  geom_bar(stat = "identity") +
  labs(title = "Ranking of Genera by Total Count Within Each Species",
       x = "taxon",
       y = "Total Count") +
  scale_fill_brewer(palette = "Set3") +  # You can choose a different color palette if desired
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##########################################################################################
################## visualization: percentages per taxon per stinkbug   ###################
##########################################################################################

# Calculate the percentage of total reads each taxon represents within each species
ranked_data <- ranked_data %>%
  group_by(species) %>%
  mutate(Percentage = (total_count / sum(total_count)) * 100) %>%
  arrange(species, desc(Percentage))

# Create a barplot to visualize the percentage of total reads
ggplot(ranked_data, aes(x = reorder(taxon, -Percentage), y = Percentage, fill = species)) +
  geom_bar(stat = "identity") +
  labs(title = "Percentage of Total Reads by taxon Within Each Species",
       x = "taxon",
       y = "Percentage of Total Reads (%)") +
  scale_fill_brewer(palette = "Set3") +  # You can choose a different color palette if desired
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Get the top ten genera for each species
top_ten_genera <- ranked_data %>%
  group_by(species) %>%
  top_n(10, wt = Percentage) %>%
  ungroup() %>%
  arrange(species, desc(Percentage))

# Create a barplot to visualize the top ten genera by species
ggplot(top_ten_genera, aes(x = reorder(taxon, -Percentage), y = Percentage, fill = species)) +
  geom_bar(stat = "identity") +
  labs(title = "Top Ten Genera by Percentage of Total Reads Within Each Species",
       x = "taxon", y = "Percentage of Total Reads (%)") +
  scale_fill_brewer(palette = "Set3") +  # You can choose a different color palette if desired
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##########################################################################################
################## METRIC: PERMANOVAS    #################################################
##########################################################################################

#clean up the metadata
colnames(meta_all)
meta <- meta_all %>%
  dplyr:::select(-label)
meta <- meta %>%
  dplyr:::select(-ugentlabel)
meta <- meta %>%
  dplyr:::select(-lat)
meta <- meta %>%
  dplyr:::select(-coords)
meta <- meta %>%
  dplyr:::select(-lon)
meta <- meta %>%
  dplyr:::select(-source)
meta <- meta %>%
  dplyr:::select(-lifestage)
meta <- meta %>%
  dplyr:::select(-whatstage)
meta <- meta %>%
  dplyr:::select(-hostplant)
colnames(meta)
meta <- meta %>%
  mutate(date = dmy(date),  # Ensure the 'date' column is in Date format
         season = case_when(
           month(date) %in% 3:5 ~ "Spring",
           month(date) %in% 6:8 ~ "Summer",
           month(date) %in% 9:11 ~ "Autumn",
           TRUE ~ "Winter"
         ))

colnames(meta)
head(meta)
dim(meta)
meta$landscape_type <- interaction(meta$landscape, meta$type)

# Assuming your "date" column is in Date format, if not, convert it first
meta$date <- as.Date(meta$date)

# Create a new column "month"
meta$month <- month(meta$date, label = TRUE)

# View the updated meta data frame
head(meta)

# make subsets per stink bug species
meta_halys <- meta %>% filter(species == "Halyomorpha halys")
meta_penta <- meta %>% filter(species == "Pentatoma rufipes")

#remove chlorophyta and unassigned from OTU frame
aggregated_data_otu<- aggregated_data %>%
  filter(taxon != "Unassigned" & taxon !="Chlorophyta")
head(aggregated_data_otu)

# Filter the sample IDs that have 'Halyomorpha halys' as the species
halys_sample_ids <- meta_all %>%
  filter(species == "Halyomorpha halys") %>%
  pull(sample_id)

# Create an OTU table with only the columns corresponding to 'halys_sample_ids'
otutable_halys <- aggregated_data_otu %>%
  dplyr:::select(one_of(halys_sample_ids))

#create otutable with RRA for h. halys
total_reads_per_sample <- colSums(otutable_halys)

otutable_halys_RRA <- sweep(otutable_halys, 2, total_reads_per_sample, FUN="/") * 100

otutable_halys_RRA

# Filter the sample IDs that have 'Pentatoma rufipes' as the species
penta_sample_ids <- meta %>%
  filter(species == "Pentatoma rufipes") %>%
  pull(sample_id)

# Create an OTU table with only the columns corresponding to 'penta_sample_ids'
otutable_penta <- aggregated_data_otu %>%
  dplyr:::select(one_of(penta_sample_ids))

#create otutable with RRA for p. rufipes
total_reads_per_sample_penta <- colSums(otutable_penta)

otutable_penta_RRA <- sweep(otutable_penta, 2, total_reads_per_sample_penta, FUN="/") * 100

otutable_penta_RRA


#prepare for PERMANOVAS
dim(RRA_halys)
dim(meta_halys)

# List of sample IDs to remove
sample_ids_to_remove <- c("X111", "X120", "X23", "X27", "X30", "X55", "X63", "X67", "X68", "X76", "X82", "X89", "X93")

# Filter out rows with these sample IDs
meta_halys <- meta_halys %>%
  filter(!(sample_id %in% sample_ids_to_remove))

RRA_halys<- RRA_halys[!(rownames(RRA_halys) %in% sample_ids_to_remove), ]

RRA_halys_t<-t(RRA_halys)

permanova_results_halys <- list()

permanova_results_halys<- adonis2(RRA_halys ~ location + month, data=meta_halys, permutations=9999)

permanova_results_halys

# Filter the sample IDs that have 'Pentatoma rufipes' as the species
penta_sample_ids <- meta_all %>%
  filter(species == "Pentatoma rufipes") %>%
  pull(sample_id)

# Create an OTU table with only the columns corresponding to 'halys_sample_ids'
otutable_penta <- aggregated_data_otu %>%
  dplyr:::select(one_of(penta_sample_ids))

total_reads_per_sample_penta <- colSums(otutable_penta)
otutable_penta_RRA <- sweep(otutable_penta, 2, total_reads_per_sample_penta, FUN="/") * 100

# Print the RRA table to see the result
print(otutable_penta_RRA)

otutable_penta_RRA
RRA_penta<-t(otutable_penta_RRA)

permanova_results_penta <- list()

permanova_results_penta<- adonis2(RRA_penta ~ location + month, data=meta_penta, permutations=9999)

permanova_results_penta


##########################################################################################
################## METRIC: number of OTUs & median read length    ########################
##########################################################################################

## read the ASV's
sequences_all <- readDNAStringSet("dna-sequences_clustered_all.fasta")
sequences_lab<-readDNAStringSet("dna-sequences_clustered_lab.fasta")


#######################SNIPPET FOR MEDIAN LENGTH AND NUMBER OF ASVS before filtering
num_sequences <- length(sequences_all)
print(num_sequences)


combined_fasta <- c(sequences_all, sequences_lab)

writeXStringSet(combined_fasta, filepath = "combined.fasta")



sequence_lengths_all <- width(sequences_all)
sequence_lengths_lab <- width(sequences_lab)
sequence_lengths<-width(combined_fasta)


median_length_all <- median(sequence_lengths_all)
median_length_lab <- median(sequence_lengths_lab)
median_length<-median(sequence_lengths)
num_sequences <- length(sequences_all)
print(num_sequences)
print(median_length)



##########################################################################################
################## PREPROCESSING: lab samples/timeseries   ###############################
##########################################################################################

### read the data
otutable_lab <- read_tsv('table_clustered_lab.tsv')
otutaxonomy_lab <- read_tsv('taxonomy_consensus_clustered_lab.tsv')
meta_lab <- read_tsv("meta_lab.tsv")

# Merge the two data frames based on the common column 'otu ids'
merged_data_lab <- inner_join(otutable_lab, otutaxonomy_lab, by = "otu_id")

# Extract the entire column of taxon from otutaxonomy
taxon_column_lab <- otutaxonomy_lab$taxon

# Adjust column names
sample_columns_lab <- names(merged_data_lab)[which(!names(merged_data_lab) %in% c("otu_id", "taxon"))]

# Identify and handle non-numeric values in the sample count columns
merged_data_lab[sample_columns_lab] <- lapply(merged_data_lab[sample_columns_lab], function(x) {
  ifelse(!grepl("^\\d+$", as.character(x), ignore.case = TRUE), NA, as.numeric(x))
})

# Group by taxon and summarize the counts for each sample
aggregated_data_lab <- merged_data_lab %>%
  group_by(taxon) %>%
  summarise(across(all_of(sample_columns_lab), sum, na.rm = TRUE))

# Print the aggregated data frame
print(aggregated_data_lab)

long_data_lab <- aggregated_data_lab %>%
  pivot_longer(-taxon, names_to = "sample_id", values_to = "count")

# Left join the long format data with the metadata based on 'sample_id'
combined_data_lab<- left_join(long_data_lab, meta_lab, by = c("sample_id" = "sample_id"))

print(combined_data_lab)

# Save to file
write.table(combined_data_lab, file='combined_data_lab.tsv', sep='\t', row.names=FALSE)

# Filter out rows where the count is not equal to 0
filtered_data_lab <- combined_data_lab%>%
  filter(count != 0)

# Calculate the relative abundance per sample as a percentage
filtered_data_lab_rel <- filtered_data_lab %>%
  group_by(sample_id) %>%
  mutate(relative_abundance = (count / sum(count)) * 100)

head(filtered_data_lab_rel)

# Calculate the total count
total_count <- sum(combined_data_lab$count)

# Extract the counts for the specified taxa
selected_taxa_counts <- combined_data_lab %>%
  filter(taxon %in% c("Phaseolus", "Helianthus", "Malus")) %>%
  pull(count)

# Sum the counts for the specified taxa
selected_taxa_total <- sum(selected_taxa_counts)

# Calculate the percentage
percentage <- (selected_taxa_total / total_count) * 100

# Print the percentage
print(percentage)

# Filter the dataset to keep only rows with the specified taxa
filtered_data <- filtered_data_lab_rel %>%
  filter(taxon %in% c("Phaseolus", "Helianthus", "Malus"))

# View the filtered data
head(filtered_data)

write.table(filtered_data_lab_rel, file='filtered_data_lab_rel.tsv', sep='\t', row.names=FALSE)

df_lab<-filtered_data



##########################################################################################
################## VIZSUALIZATION: barplot of lab samples   ##############################
##########################################################################################

# BARPLOT FOR labsamples, FACET GRID BY time ###############
timeplot <- ggplot(df_lab, aes(x = sample_id, y = relative_abundance, fill = taxon)) +
  geom_bar(stat = "identity", position = "fill", width = 0.5) +
  facet_wrap(~ time, scales = "free_x", labeller = labeller(time = function(x) paste0(x, " hours")))+
  scale_y_continuous(name = "Relative abundance", labels = percent_format()) +
  labs(fill = "Genus", face="bold") +
  scale_fill_manual(values = c("Helianthus" = "#9ACDF0",  
                               "Malus" = "#FCAE52",     
                               "Phaseolus" = "#8EDB75")
  )+
  labs(x = "Sample")+
  scale_x_discrete(labels = NULL)+
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text.x = element_text(angle = 0, vjust = 0.5, size = 6),
    axis.text.y = element_text(color = "black", size = 12, face = "bold"),
    axis.title.x = element_text(size = 14, face = "bold", margin = margin(t = 10)),
    axis.title.y = element_text(size = 14, face = "bold", margin = margin(r = 10)),
    strip.text = element_text(size =12, angle = 0, face = "bold"),
    strip.background = element_blank(),
    panel.border = element_rect(fill = NA, color = "black"),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    panel.background= element_blank(),
    legend.text = element_text(face = "italic", hjust = 0)
  )
print(timeplot)