
# Load the necessarily packages
library("tidytext")
library("textdata")
library("dplyr")
library("stringr")
library("tidyr")
library("lubridate")
library("ggplot2")
library("RColorBrewer")

# set the working directory
setwd("/Users/User/Documents/DataJournal/Projects/SentimentAnalysisTexts")

# Competing elements: fear and fatigue

###################### Create the dataset ##################

# read in the datasets
t = read.csv("allthetexts.csv", na.strings = c("NULL","", " "))
t$TextDateTime = as.POSIXct(t$TextDate, format = "%Y-%m-%d %H:%M:%S")
t$TextDate = as.Date(as.character(t$TextDateTime))

# remove duplicate rows
t = t[!is.na(t$GUID),]
t$IsFromMe[t$FromPhoneNumber %in% c("myfirstnumber","myicloudid","mysecondnumber")] = 1

# some texts were in an unreadable format, so I extracted the 
# body of the texts separately and need to merge them in
body = read.csv("readablemessageoutput.csv", skipNul = TRUE)
colnames(body)[1] = "GUID"

t = left_join(t, body)
t$MessageText[is.na(t$MessageText)] = t$body[is.na(t$MessageText)]

# filter to just my texts
t = t[t$IsFromMe == 1,]

# remove columns I don't need
t = t[,-which(colnames(t) %in% c("FromPhoneNumber","Service","IsFromMe","RoomName"))]

# I want a column for each phone number the text was sent to instead of a separate row for each recipient
t = t %>% group_by(rowid, ThreadId, TextDate, MessageText) %>%
  mutate(row = row_number())  %>%
  pivot_wider(names_from = row, values_from = ToPhoneNumber, names_prefix = "Recipient")

# keep just unique texts
t = t[!duplicated(t$GUID),]

# I don't want to include my reactions to other messages
t <- t %>%
  filter(!str_starts(MessageText, regex("^(loved|liked|questioned|emphasized|disliked|laughed at|removed a) ", ignore_case = TRUE)))

# I started having reliable data on March 2, 2017
t = t[t$TextDate > as.Date("2017-03-02"),]

###################### Format dataset, get sentiments ##############

t_full = t
t_full_text = t_full[,which(colnames(t_full) %in% c("ROWID","MessageText"))]

library("sentimentr")
sents = sentiment_by(t_full$MessageText)

# get the texts and sentiments in one dataframe
posneg = cbind(t_full, sents)

###################### Name people texted frequently ############

# See who I text most frequently
sort(table(posneg$Recipient1[is.na(posneg$Recipient2)]))

# now add people's names to subset on who the texts are to
posneg$nm = NA
posneg$nm[posneg$Recipient1 == "number here"] = "friend name" 
# repeat the last line many times for different people
posneg$nm[is.na(posneg$nm)] = "Other"

# check to make sure I named everyone I texted at least 1,000 times
sort(table(posneg$Recipient1[is.na(posneg$Recipient2) & is.na(posneg$nm)]))

###################### Function to make rainbow colors #########

# function to make well-separated colors
gg_color <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}
# create color scale so charts have consistent colors for each level
myColors <- gg_color(9)
posneg$name = as.factor(as.character(posneg$name))
names(myColors) <- levels(posneg$name)
colScale <- scale_fill_manual(name = "name",values = myColors)

###################### Graph n texts over time #################

ggplot(posneg, aes(x = TextDate)) +
  geom_histogram(binwidth = 7, fill = "#7f977d") +
  theme(legend.position = "none",
        text=element_text(size=14)) +
  scale_x_date(breaks = c(as.Date("2018-01-01"),"2019-01-01","2020-01-01","2021-01-01",
                          "2022-01-01","2023-01-01","2024-01-01","2025-01-01"), 
               labels = c("2018","2019","2020","2021","2022","2023","2024","2025"),
               limits = c(as.Date("2017-03-02"),as.Date("2025-12-08")),
               name = "Date",
               expand = c(0, 0)) +
  scale_y_continuous("Count of Texts I sent") +
  # annotate the graph
  annotate("text", x = as.Date("2018-09-01"), y = 2300, lineheight = 1, 
           label = "I was falling\nin love and she \nwas away on \na roadtrip") +
  annotate("text", x = as.Date("2020-07-01"), y = 1500, lineheight = 1, 
           label = "COVID") +
  annotate("text", x = as.Date("2021-05-01"), y = 1000, lineheight = 1, 
           label = "Missing\ndata") +
  geom_segment(x = as.Date("2018-01-01"), y = 2200, xend = as.Date("2017-10-01"), yend = 2100, # Start and end coordinates
               arrow = arrow(length = unit(0.25, "cm")), # Defines the arrow head
               linewidth = .3) +
  geom_segment(x = as.Date("2020-07-01"), y = 1400, xend = as.Date("2020-05-01"), yend = 1100, # Start and end coordinates
               arrow = arrow(length = unit(0.25, "cm")), # Defines the arrow head
               linewidth = .3) +
  geom_segment(x = as.Date("2021-02-01"), y = 950, xend = as.Date("2020-12-01"), yend = 200, # Start and end coordinates
               arrow = arrow(length = unit(0.25, "cm")), # Defines the arrow head
               linewidth = .3)

###################### Cleaning for sentiment graphs throughout week: time variables #############

# get values by hour
posneg <-
  posneg %>%
  mutate(
    weekday = wday(TextDateTime, label = TRUE),
    hour = hour(TextDateTime),
    date = date(TextDateTime),
    minute = minute(TextDateTime)
  )

# get all the other time variables we need 
posneg$hour_cont = posneg$hour + posneg$minute/60
posneg$Weekdayname = weekdays(posneg$TextDate)
posneg$year = year(posneg$TextDate)

###################### Get various sentiment variables - trinary, trinaryz #############

# make trinary info
posneg$trinary = ifelse(posneg$ave_sentiment > 0, 1, ifelse(posneg$ave_sentiment == 0, 0, -1))

# make z-scores for trinary
posneg = posneg %>% group_by(nm) %>% 
  mutate(sdt = sd(trinary), meant = mean(trinary)) %>% ungroup()
posneg$z_trinary = (posneg$trinary - posneg$meant) / posneg$sdt

###################### Compute rolling average values for weekly time points #############

posneg$weekdaynum <- as.integer(format(posneg$TextDateTime, "%w"))
posneg$total_minutes = posneg$weekdaynum * 24 * 60 + posneg$hour * 60 + posneg$minute

time_points <- expand.grid(
  minute = seq(0, 45, by = 15),
  hour = 0:23,
  weekday = c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday")
)
time_points$totalmin = seq(0,10065, by = 15)


# Function to compute rolling average for each (weekday, hour, minute)
compute_rolling_avg <- function(targetmin) {
  
  # Filter data within the 2-hour window for the same weekday
  window_data <- posneg %>%
    filter(abs(total_minutes - targetmin) %% 10080 < 60)
  
  # Compute the mean score
  c(mean(window_data$z_trinary, na.rm = T), 
    mean(window_data$trinary, na.rm = T), 
    ntexts = nrow(window_data))
}

# Apply the function to each time point
time_points$rolling_avg_trinary_z = NA
time_points$rolling_avg_trinary_raw = NA
time_points$ntexts = NA
time_points[,c((ncol(time_points)-2):ncol(time_points))] <- t(mapply(compute_rolling_avg, time_points$totalmin))

###################### Weekly graph ##############

# Make the end of the day 4am for the graphs
time_points$weekdaynum = as.numeric(time_points$weekday)
time_points$weekday2 = ifelse(time_points$hour < 4, time_points$weekdaynum - 1, time_points$weekdaynum)
time_points$hour2 = ifelse(time_points$hour < 4, time_points$hour + 24, time_points$hour)
time_points$weekday2[time_points$weekday2 == 0] = 7
time_points$weekday2 = factor(time_points$weekday2, levels = 1:7, labels = levels(time_points$weekday))
time_points$hours = (time_points$hour2*60 + time_points$minute)/60

# get the average for each day of the week
tpmean = time_points[time_points$ntexts > 2000,] %>% 
  group_by(weekday2) %>%
  summarize(
    mean_trinary = mean(rolling_avg_trinary_raw),
    mean_trinary_z = mean(rolling_avg_trinary_z)
  )

# graph sentiments throughout the week
time_points[time_points$ntexts >= 2000,] %>% 
  ggplot(aes(hours, rolling_avg_trinary_raw, color = weekday2)) +
  geom_point() + 
  geom_line() +
  facet_grid(.~weekday2) + 
  xlab("Time") +
  ylab("Average Sentiment") +
  ggtitle("Average Sentiment of Texts Throughout the Week") +
  geom_hline(data = tpmean, aes(yintercept = mean_trinary, color = weekday2),lty='dashed') +
  labs(color = "Weekday") + 
  scale_x_continuous(
    breaks = c(8,12,16,20,24),  # adjust to your actual break points
    labels = c("8am", "noon", "4pm", "8pm", "Midnight")) +
  theme(text=element_text(size=16),
        axis.text.x = element_text(angle = 60, hjust = 1, size = 9),
        plot.title = element_text(hjust = 0.5),
        plot.title.position = "plot")

# Sensitivity analyses - graph z-scores instead of raw values
time_points[time_points$ntexts >= 2000,] %>% 
  ggplot(aes(hours, rolling_avg_trinary_z, color = weekday2)) +
  geom_point() + 
  geom_line() +
  facet_grid(.~weekday2) + 
  xlab("Time") +
  ylab("Average Sentiment") +
  geom_hline(data = tpmean, aes(yintercept = mean_trinary_z, color = weekday2),lty='dashed') +
  labs(color = "Weekday") + 
  ggtitle("Average Sentiment of Texts Compared to Recipient Average Throughout the Week") +
  scale_x_continuous(
    breaks = c(8,12,16,20,24),  # adjust to your actual break points
    labels = c("8am", "noon", "4pm", "8pm", "Midnight")) +
  theme(text=element_text(size=16),
        axis.text.x = element_text(angle = 60, hjust = 1, size = 9),
        plot.title = element_text(hjust = 0.5),
        plot.title.position = "plot")