forked from taylorvandoren/joinpoint-regression-tutorial
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJoinpoint Program Analyses.R
173 lines (143 loc) · 8.26 KB
/
Joinpoint Program Analyses.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
### JOINPOINT REGRESSION TUTORIAL
### Guidelines for analyses in the Joinpoint Regression Program, built by the National Institutes of Health
# Taylor P. van Doren, ABD
# October 18, 2021
### IMPORTANT PRELUDE
# This file will go through the process of joinpoint regression analysis using the Joinpoint Regression Program
# Joinpoint Trend Analysis is a statistical software built by the National Cancer Institute of the
# National Institutes of health, Division of Cancer Control& Population Science. The program is completely
# open source and available for download at surveillance.cancer.gov/joinpoint. All you have to do is register
# for the software, and you will be sent a link with permission to download the program. It is perfect for
# researchers seeking to gain some insights into how rates change over time.
# This file will introdue the data and get it ready to import into the joinpoint program, but the bulk of the
# analysis will not be performed within this R file. Most of the code here will be devoted to creating
# visualizations of the data and analyses performed outside the file in the Joinpoint Regression Program.
# There is another file dedicated to logistic joinpoint analyses within this Github Repository. Each of these
# methods are useful, but they are different approaches, so use the one that serves you best.
### LOAD LIBRARIES
library(ggplot2)
library(ggsci)
### LOAD DATA
# This data is from the CDC National Center for Health Statistics National Vital Statistics System,
# the Age-Adjusted Death Rates for Selected Causes, Death Registration States, 1900-1932, and
# United States 1933-98.
# link: https://www.cdc.gov/nchs/nvss/mortality/hist293.htm
mortdata <- read.csv("mortality data.csv")
### PLOT THE DATA
ggplot(mortdata, aes(year, asdr, color = cod)) +
geom_point(size = 3.5, shape = 1) +
geom_smooth() +
theme_classic() +
xlab("Year") +
ylab("Age-Adjusted Death Rate (per 100,000)") +
scale_color_jco()
# Looking at the initial plot will really help us develop some intuition about the kinds of results we
# might hope to see from the regression analyses. Since this regression method identifies points of
# significant change in rates over a time period, and there are some clear points here that we will
# be looking out for when we run the regression.
# For example, it's clear that there was a major increase and then decrease of mortality rates from heart
# disease during the 20th century, and it appears as if the turning point was somewhere during the
# 1950s. We should keep our eyes out for identified joinpoints in that decade.
# The data are in the correct format for analyses in the joinpoint program, so we'll head over there to
# get the data analyzed.
### SEE THE PDF OF THE INSTRUCTIONS ON HOW TO RUN THESE ANALYSES IN THE JOINPOINT REGRESSION SOFTWARE.
# After analyses, we'll need to put together all the results and import them back to R so we can create
# some more appealing figures.
# Import all model fits
modelfits <- read.csv("model fits.csv")
# modelfits has many columns of data, but they are all arranged to follow the basic format:
# year
# cause of death observed mortality rate (per 100,000) <-- original data used to fit models
# cod.0: model fit with zero joinpoints
# cod.1: model fit with one joinpoint
# cod.2: model fit with two joinpoints
# cod.3: model fit with three joinpoints
# Below, we will plot each cause of death separately so we can easily see all the model fits.
# The best-fit model as determined by the software will appear as a solid line;
# all other model fits appear as dashed lines.
# ACCIDENTS
ggplot(modelfits, aes(x = year, y = accidents.obs)) +
geom_point(size = 3.5, shape = 21, col = "black", fill = "darkgrey") +
geom_line(aes(year, accidents.0), col = "#c91e63", size = 1.1, linetype = "dashed") +
geom_line(aes(year, accidents.1), col = "#9c27b0", size = 1.1, linetype = "dashed") +
geom_line(aes(year, accidents.2), col = "#ee5722", size = 1.1, linetype = "dashed") +
geom_line(aes(year, accidents.3), col = "#00bcd4", size = 1.5) +
theme_classic() +
xlab("Year") +
ylab("Age-Adjusted Death Rate (per 100,000)") +
ggtitle("Accidents")
# CANCER
ggplot(modelfits, aes(x = year, y = cancer.obs)) +
geom_point(size = 3.5, shape = 21, col = "black", fill = "darkgrey") +
geom_line(aes(year, cancer.0), col = "#c91e63", size = 1.1, linetype = "dashed") +
geom_line(aes(year, cancer.1), col = "#9c27b0", size = 1.1, linetype = "dashed") +
geom_line(aes(year, cancer.2), col = "#ee5722", size = 1.1, linetype = "dashed") +
geom_line(aes(year, cancer.3), col = "#00bcd4", size = 1.5) +
theme_classic() +
xlab("Year") +
ylab("Age-Adjusted Death Rate (per 100,000)") +
ggtitle("Cancer")
# HEART DISEASE
ggplot(modelfits, aes(x = year, y = heart.obs)) +
geom_point(size = 3.5, shape = 21, col = "black", fill = "darkgrey") +
geom_line(aes(year, heart.0), col = "#c91e63", size = 1.1, linetype = "dashed") +
geom_line(aes(year, heart.1), col = "#9c27b0", size = 1.1, linetype = "dashed") +
geom_line(aes(year, heart.2), col = "#ee5722", size = 1.1, linetype = "dashed") +
geom_line(aes(year, heart.3), col = "#00bcd4", size = 1.5) +
theme_classic() +
xlab("Year") +
ylab("Age-Adjusted Death Rate (per 100,000)") +
ggtitle("Heart Disease")
# PNEUMONIA & INFLUENZA
ggplot(modelfits, aes(x = year, y = pi.obs)) +
geom_point(size = 3.5, shape = 21, col = "black", fill = "darkgrey") +
geom_line(aes(year, pi.0), col = "#c91e63", size = 1.1, linetype = "dashed") +
geom_line(aes(year, pi.1), col = "#9c27b0", size = 1.5) +
geom_line(aes(year, pi.2), col = "#ee5722", size = 1.1, linetype = "dashed") +
geom_line(aes(year, pi.3), col = "#00bcd4", size = 1.1, linetype = "dashed") +
geom_hline(yintercept = 0, linetype = "dashed") +
theme_classic() +
xlab("Year") +
ylab("Age-Adjusted Death Rate (per 100,000)") +
ggtitle("Influenza & Pneumonia (P&I)")
# STROKE
ggplot(modelfits, aes(x = year, y = stroke.obs)) +
geom_point(size = 3.5, shape = 21, col = "black", fill = "darkgrey") +
geom_line(aes(year, stroke.0), col = "#c91e63", size = 1.1, linetype = "dashed") +
geom_line(aes(year, stroke.1), col = "#9c27b0", size = 1.1, linetype = "dashed") +
geom_line(aes(year, stroke.2), col = "#ee5722", size = 1.1, linetype = "dashed") +
geom_line(aes(year, stroke.3), col = "#00bcd4", size = 1.5) +
theme_classic() +
xlab("Year") +
ylab("Age-Adjusted Death Rate (per 100,000)") +
ggtitle("Stroke")
# TUBERCULOSIS
ggplot(modelfits, aes(x = year, y = tb.obs)) +
geom_point(size = 3.5, shape = 21, col = "black", fill = "darkgrey") +
geom_line(aes(year, tb.0), col = "#c91e63", size = 1.1, linetype = "dashed") +
geom_line(aes(year, tb.1), col = "#9c27b0", size = 1.1, linetype = "dashed") +
geom_line(aes(year, tb.2), col = "#ee5722", size = 1.1, linetype = "dashed") +
geom_line(aes(year, tb.3), col = "#00bcd4", size = 1.5) +
geom_hline(yintercept = 0, linetype = "dashed") +
theme_classic() +
xlab("Year") +
ylab("Age-Adjusted Death Rate (per 100,000)") +
ggtitle("Tuberculosis (TB)")
# Let's make a conglomerate plot to put all the best-fit models together with the fitted data
ggplot(modelfits) +
geom_point(aes(year, accidents.obs), size = 3, shape = 21, color = "#5d6d7e", fill = "grey") +
geom_point(aes(year, cancer.obs), size = 3, shape = 21, color = "#5d6d7e", fill = "grey") +
geom_point(aes(year, heart.obs), size = 3, shape = 21, color = "#5d6d7e", fill = "grey") +
geom_point(aes(year, pi.obs), size = 3, shape = 21, color = "#5d6d7e", fill = "grey") +
geom_point(aes(year, stroke.obs), size = 3, shape = 21, color = "#5d6d7e", fill = "grey") +
geom_point(aes(year, tb.obs), size = 3, shape = 21, color = "#5d6d7e", fill = "grey") +
geom_line(aes(year, accidents.3), col = "#b8396b", size = 1.5) +
geom_line(aes(year, cancer.3), col = "#ffd1d7", size = 1.5) +
geom_line(aes(year, heart.3), col = "#fff5cc", size = 1.5) +
geom_line(aes(year, pi.1), col = "#76bae0", size = 1.5) +
geom_line(aes(year, stroke.3), col = "#b28f81", size = 1.5) +
geom_line(aes(year, tb.3), col = "#54483e", size = 1.5) +
theme_classic() +
xlab("Year") +
ylab("Age-Adjusted Moratlity Rate (per 100,000)") +
ggtitle("Major Causes of Death (1900-1998) with Best-Fits")