exploring_graph_data.Rmd

---
title: "Alternative Explorations of the Online Retail Dataset"
author: "Mick Cooney <mickcooney@gmail.com>"
date: "Last updated: `r format(Sys.time(), '%B %d, %Y')`"
output:
  rmdformats::readthedown:
    toc_depth: 3
    use_bookdown: TRUE
    code_folding: hide
    fig_caption: TRUE

  html_document:
    fig_caption: yes
    theme: spacelab #sandstone #spacelab #flatly
    highlight: pygments
    number_sections: TRUE
    toc: TRUE
    toc_depth: 2
    toc_float:
      smooth_scroll: FALSE

  pdf_document: default
---


```{r import_libraries, echo=FALSE, message=FALSE}
library(conflicted)
library(tidyverse)
library(scales)
library(cowplot)
library(magrittr)
library(rlang)
library(purrr)
library(furrr)
library(glue)
library(DT)
library(tidygraph)


# Load project helper functions; resolve_conflicts() used below is not defined
# in this file, so it presumably comes from here -- TODO confirm.
source("lib_utils.R")

# Settle function-name clashes between the listed packages (the file loads
# `conflicted`, which otherwise errors on ambiguous names). The exact
# precedence rule lives in resolve_conflicts() -- verify against lib_utils.R.
conflict_lst <- resolve_conflicts(
  c("magrittr", "rlang", "dplyr", "readr", "purrr", "ggplot2")
  )


# Default chunk options for the whole document: no code tidying, no caching,
# warnings and messages hidden, and a common figure size.
knitr::opts_chunk$set(
  tidy       = FALSE,
  cache      = FALSE,
  warning    = FALSE,
  message    = FALSE,
  fig.height =     8,
  fig.width  =    11
  )

# Session options: console width, warnings printed as they occur (warn = 1),
# and the core count made available to parallel back-ends.
options(
  width = 80L,
  warn  = 1,
  mc.cores = parallel::detectCores()
  )

# Use the cowplot theme for every ggplot in the document.
theme_set(theme_cowplot())

# Fix the RNG seed so that randomised steps are repeatable between runs.
set.seed(42)

# Run futures (e.g. the future_map() call later in the document) in
# background R sessions.
plan(multisession)
```


# Load Data

We first want to load our datasets and prepare them for some simple association
rules mining.

```{r load_transaction_data, echo=TRUE}
# Read the cleaned transaction table from disk and show its column structure.
tnx_data_tbl <- read_rds("data/retail_data_cleaned_tbl.rds")

glimpse(tnx_data_tbl)
```

To use our rules mining we just need the invoice data and the stock code, so
we can ignore the rest. Also, we ignore the issue of returns and just look at
purchases.

```{r prepare_data_arules, echo=TRUE}
# Keep purchase lines only: positive quantity and price, not flagged for
# exclusion, and with a known customer. Stock codes are normalised to upper
# case and the table is trimmed to the columns used later on.
tnx_purchase_tbl <- tnx_data_tbl %>%
  filter(quantity > 0 & price > 0 & exclude == FALSE) %>%
  filter(!is.na(customer_id)) %>%
  mutate(stock_code = str_to_upper(stock_code)) %>%
  select(
    invoice_id, stock_code, customer_id,
    quantity, price, stock_value,
    description
    )

glimpse(tnx_purchase_tbl)
```

We also load the stock description data, as this will be useful as part of the
graph analysis.

```{r load_stock_description, echo=TRUE}
# Read the stock description lookup table and show its column structure.
stock_description_tbl <- read_rds("data/stock_description_tbl.rds")

glimpse(stock_description_tbl)
```


# Basket Analysis

We want to do some basic basket analysis before we investigate using full
'association rules' libraries for our modelling.

Rather than repeating all the logic, it might be worth looking at building
some basic code to construct all the item pairs in each basket.


```{r construct_basket_pairs, echo=TRUE}
# Build every unordered pair of distinct items found in one basket.
#
# Args:
#   stock_codes  Vector of the stock codes appearing on a single invoice. The
#                same code may appear more than once (one entry per line).
#
# Returns:
#   A tibble with columns `one` and `two`, holding each pair exactly once with
#   `one < two`. A basket with fewer than two distinct codes gives zero rows.
construct_pairwise_items <- function(stock_codes) {
  # A code repeated across invoice lines would otherwise emit the same pair
  # several times, and any per-invoice pair count built on top of this would
  # then count that invoice more than once.
  unique_codes <- unique(stock_codes)

  pairs_tbl <- expand_grid(
      one = unique_codes,
      two = unique_codes
      ) %>%
    # Keeping only one < two drops self-pairs and the mirrored (two, one) copy.
    filter(one < two)

  return(pairs_tbl)
}

# Gather the stock codes of each invoice into a list-column, then expand each
# basket into its item pairs with future_map().
basket_data_tbl <- tnx_purchase_tbl %>%
  group_by(invoice_id) %>%
  summarise(stock_codes = list(stock_code), .groups = "drop") %>%
  mutate(
    basket_data = future_map(
      stock_codes,
      construct_pairwise_items,
      .progress = TRUE
      )
    )

glimpse(basket_data_tbl)
```


# Graph Analysis

We can treat this data as a graph, turning both invoices and stock items into
nodes on the graph, and create a connection between stock and invoices when the
item occurs on the invoice.

This graph will get large, but it is a starting point.


```{r construct_basket_graph, echo=TRUE}
# Node table: one row per distinct stock code and one per distinct invoice,
# each tagged with its node type.
stock_nodes_tbl <- tnx_purchase_tbl %>%
  distinct(stock_code) %>%
  transmute(node_label = stock_code, node_type = "stock")

invoice_nodes_tbl <- tnx_purchase_tbl %>%
  distinct(invoice_id) %>%
  transmute(node_label = invoice_id, node_type = "invoice")

nodes_tbl <- bind_rows(stock_nodes_tbl, invoice_nodes_tbl)

# Edge table: one row per (stock code, invoice) combination, carrying the
# total quantity and total cost of that item on that invoice.
edges_tbl <- tnx_purchase_tbl %>%
  group_by(stock_code, invoice_id) %>%
  summarise(
    total_quantity = sum(quantity),
    total_cost     = sum(quantity * price),
    .groups        = "drop"
    )

# Undirected graph; the edge end-points are looked up in `node_label`.
basket_tblgraph <- tbl_graph(
  nodes    = nodes_tbl,
  edges    = edges_tbl,
  node_key = "node_label",
  directed = FALSE
)
```


## Check Graph Clustering Approaches

First we perform our basic clustering by splitting off the different disjoint
components of the graph.

```{r create_disjoint_component_labels, echo=TRUE}
# Label every node with the id of its connected component, then attach the
# number of nodes sharing that component id.
basket_tblgraph <- basket_tblgraph %>%
  mutate(component_id = group_components()) %>%
  group_by(component_id) %>%
  mutate(component_size = n()) %>%
  ungroup()

print(basket_tblgraph)
```

We now want to check the sizes of the disjoint components of this graph.

```{r display_main_component_sizes, echo=TRUE}
# Number of stock-code nodes in each component, largest component first.
as_tibble(basket_tblgraph) %>%
  filter(node_type == "stock") %>%
  count(component_id, sort = TRUE, name = "stock_count")
```

We see that almost all the stock codes are contained in that one large
component and so confine the rest of this analysis to that one large component.

```{r run_subgraph_clusters, echo=TRUE}
# Cluster a graph with the supplied grouping function and summarise how many
# stock nodes land in each cluster.
#
# Args:
#   graph_cluster_func  Function called inside mutate() on the node table to
#                       produce a cluster id per node (e.g. group_louvain).
#   labelling           Text used in the progress message.
#   input_tblgraph      Graph to cluster; its nodes need a `node_type` column.
#   ...                 Further arguments passed on to `graph_cluster_func`.
#
# Returns:
#   A tibble with one row per cluster containing stock nodes: `cluster_size`
#   is the stock-node count and `cluster_id` is a factor relabelled 1..k in
#   decreasing order of size.
run_subgraph_clusters <- function(graph_cluster_func, labelling, input_tblgraph, ...) {
  message(glue("Clustering the graph using {labelling}..."))

  subgraph_clusters_tbl <- input_tblgraph %>%
    # Activate nodes before mutate() so the cluster ids are always written to
    # the node table, whatever the caller left active.
    activate(nodes) %>%
    mutate(
      cluster_id = graph_cluster_func(...)
      ) %>%
    as_tibble() %>%
    filter(node_type == "stock") %>%
    count(cluster_id, name = "cluster_size", sort = TRUE) %>%
    mutate(
      # seq_len() rather than 1:n(): with zero rows 1:n() is c(1, 0), which
      # cannot be recycled into an empty table and makes mutate() fail.
      cluster_id = factor(seq_len(n()), levels = seq_len(n()))
    )

  return(subgraph_clusters_tbl)
}
```


```{r test_subgraph_cluster_sizes, echo=TRUE, cache=TRUE}
# Names of the community-detection routines to compare.
cluster_func <- c(
    "group_fast_greedy", "group_infomap",
    "group_leading_eigen", "group_louvain"
    )

# Restrict the graph to the nodes of the largest connected component.
largecomp_tblgraph <- convert(
  basket_tblgraph,
  to_subgraph, component_size == max(component_size)
  )

# Look up each routine by name, run it on the large component, and stack the
# per-cluster size summaries into one table keyed by routine name.
cluster_data_tbl <- tibble(cluster_func_name = cluster_func) %>%
  mutate(cluster_func = map(cluster_func_name, get)) %>%
  mutate(
    clustered = map2(
      cluster_func, cluster_func_name, run_subgraph_clusters,
      input_tblgraph = largecomp_tblgraph
      )
    ) %>%
  select(cluster_func_name, clustered) %>%
  unnest(clustered)

glimpse(cluster_data_tbl)
```

Having created a summary of the data splits, we now want to construct a
visualisation of how the various cluster routines split the data.

To do this, we turn the size of each cluster into a 'label' and then count how
many clusters of that size there are. We then use this summary data to
construct barplots of the size.

```{r visualise_community_splits, echo=TRUE}
# For each clustering routine, count how many clusters have each size; the
# size is turned into a factor so it can be used as a discrete axis label.
plot_tbl <- cluster_data_tbl %>%
  group_by(cluster_func_name) %>%
  count(cluster_size, sort = TRUE, name = "cluster_count") %>%
  ungroup() %>%
  mutate(cluster_size = as.factor(cluster_size))


# One bar chart per routine, with independent axes in each panel.
ggplot(
    plot_tbl,
    aes(x = cluster_size, y = cluster_count, group = cluster_size)
    ) +
  geom_col() +
  facet_wrap(vars(cluster_func_name), scales = "free") +
  labs(
    title = "Visualisation of Spread of Cluster Sizes",
    x = "Cluster Size",
    y = "Community Count"
    ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
```

From this graphic, we see that `group_louvain` gives us the most even split
across the data - though the sizes are still hugely unequal.


## Create Cluster-Based Allocation

We now use this algorithm to cluster this large component in the graph, and
this gives us an alternative allocation of each `stock_code` to a product
group.

```{r cluster_largest_component_louvain, echo=TRUE}
# Assign a Louvain cluster id to every node of the large component, keep the
# stock nodes, and build a zero-padded text label for each cluster.
largecomp_clustered_tbl <- largecomp_tblgraph %>%
  mutate(cluster_id = group_louvain()) %>%
  activate(nodes) %>%
  as_tibble() %>%
  filter(node_type == "stock") %>%
  mutate(cluster_group = sprintf("LOUVAIN_%03d", cluster_id))

glimpse(largecomp_clustered_tbl)
```


# Construct Non-Transaction Graphs

We now want to construct a graph where we create a link between all products
that co-occur in a transaction, so that transactions are now represented by
the weights on the edges of the graph, rather than being nodes.

```{r construct_pairwise_graph, echo=TRUE}
# Node table for the product co-occurrence graph: one row per stock code seen
# on a purchase line (positive quantity and price, not flagged for exclusion).
pairwise_nodes_tbl <- tnx_data_tbl %>%
  filter(
    quantity > 0,
    price > 0,
    exclude == FALSE
    ) %>%
  mutate(
    stock_code = str_to_upper(stock_code)
    ) %>%
  # The filtered table has one row per transaction line. Without this step
  # every line becomes its own graph node, so each stock code is repeated many
  # times and the repeats show up as isolated nodes.
  distinct(stock_code) %>%
  arrange(stock_code)

# Edge table: for each item pair, the number of baskets containing that pair.
pairwise_edges_tbl <- basket_data_tbl %>%
  select(basket_data) %>%
  unnest(basket_data) %>%
  count(from = one, to = two, name = "invoice_count")


# Undirected graph; `from`/`to` are matched against the nodes' `stock_code`.
pairwise_tblgraph <- tbl_graph(
  nodes    = pairwise_nodes_tbl,
  edges    = pairwise_edges_tbl,
  directed = FALSE,
  node_key = "stock_code"
)

pairwise_tblgraph %>% print()
```

## Extract Largest Component

We now run clustering on the largest component of this graph.

```{r extract_pairwise_largest_component, echo=TRUE}
# Tag each node with its connected component and the node count of that
# component, then keep only the sub-graph of the largest component.
pairwise_largecomp_tblgraph <- pairwise_tblgraph %>%
  mutate(component_id = group_components()) %>%
  group_by(component_id) %>%
  mutate(component_size = n()) %>%
  ungroup() %>%
  convert(to_subgraph, component_size == max(component_size))

print(pairwise_largecomp_tblgraph)
```

We now run some community detection on this largest component as a way to
segment the product space.

```{r cluster_pairwise_largecomp, echo=TRUE}
# Louvain communities on the largest component, weighting each edge by the
# number of invoices on which the two items co-occur.
pairwise_stockgroup_tbl <- pairwise_largecomp_tblgraph %>%
  mutate(
    cluster_id = group_louvain(weights = invoice_count)
    ) %>%
  activate(nodes) %>%
  as_tibble() %>%
  transmute(
    stock_code,
    cluster_label = sprintf("PAIRWISE_%03d", cluster_id)
    )

# Stock codes outside the largest component share a single catch-all label.
# Reduce to distinct codes first: if the node table holds several rows per
# stock code, the join below would otherwise be many-to-many and multiply the
# rows for these codes.
pairwise_additional_tbl <- pairwise_nodes_tbl %>%
  distinct(stock_code) %>%
  anti_join(pairwise_stockgroup_tbl, by = "stock_code") %>%
  transmute(
    stock_code,
    cluster_label = "PAIRWISE_OTHER"
    )

# Combine both label sets and attach them to the rows of the node table.
pairwise_groups_tbl <- list(
    pairwise_stockgroup_tbl,
    pairwise_additional_tbl
    ) %>%
  bind_rows() %>%
  inner_join(pairwise_nodes_tbl, by = "stock_code") %>%
  arrange(stock_code)

pairwise_groups_tbl %>% glimpse()
```


# R Environment

```{r show_session_info, echo=TRUE, message=TRUE}
# Print the R session details for this run, for reproducibility.
sessioninfo::session_info()
```