Last updated 2020-04-24
## ----setup, include=FALSE------------------------------------------------------------------------------------
library(dplyr)
library(readxl)
library(reshape2)
library(ggplot2)
library(plotly)
library(kableExtra)
library(RColorBrewer)
library(scales)
# Eval 4th lag value?
eval_lag4 = "F"
# date origin
origin = as.Date("1970-01-01")
# Date of State "Stay Home, Stay Healthy" order
stateOrderDate <- as.Date("2020-03-23")
# set up min and max sensitivity ranges for FRs
# use +/- (Verity range/2)
# IFR: 0.009 +/- .0.005
vintIFR = (0.0133-0.00389)/2
# CFR: .018 +/- .0015
vintCFR = (0.0153-0.0123)/2
# Population of King County
# source: https://www.census.gov/quickfacts/fact/table/kingcountywashington/PST045219
popn <- 2252782
## ----age_specific_frs----------------------------------------------------------------------------------------
# now taken from Verity, et al. Lancet, 3/30/2020
# NOTE: these are modified from percent to rate scale.
estFRs <- data.frame(readxl::read_excel(here::here("data", "estFRs.xlsx"),
n_max = 9),
stringsAsFactors = F) %>%
mutate(IFR = IFR/100, CFR = CFR/100)
# Display FRs
## ----popdist, message=F, warning=F---------------------------------------------------------------------------
# scrape data if necessary
if(!file.exists(here::here("data","WA_age_pop_distn.Rda"))){
WA_age_pop_distn <- rio::import("https://www.ofm.wa.gov/sites/default/files/public/dataresearch/pop/asr/agesex/ofm_pop_age_sex_postcensal_2010_2019_s.xlsx",
which = "Population")
save(WA_age_pop_distn, file=here::here("data", "WA_age_pop_distn.Rda"))
} else{
load(here::here("data","WA_age_pop_distn.Rda"))
}
# function to calculate county pop distn
calc_age_distn <- function(dataf = WA_age_pop_distn, county = "King"){
agedist <- dataf %>%
select("County" = "Area Name",
"AgeGrp" = "Age Group",
"Popn2019" = "2019 Total") %>%
subset(County == county & AgeGrp != "Total") %>%
mutate(age.index = c(rep(1:9, c(2, 6, rep(2,7))))) %>%
select(Popn2019, age.index) %>%
group_by(age.index) %>%
summarise(Pop2019 = round(sum(as.numeric(Popn2019)),0)) %>%
mutate(Age.Range = c("0-9","10-19","20-29","30-39",
"40-49", "50-59", "60-69",
"70-79","80+")) %>%
mutate(Age.Frac = Pop2019/sum(Pop2019)) %>%
select(Age.Range, Pop2019, Age.Frac)
return(agedist)
}
# King Co
KC_age_distn <- calc_age_distn(county = "King")
# WA state total
WA_age_distn <- calc_age_distn(county = "Washington")
# Other WA (WA - KC)
OW_age_distn <- data.frame("Age.Range" = WA_age_distn$Age.Range,
"Pop" = WA_age_distn$Pop2019 - KC_age_distn$Pop2019) %>%
mutate("Age.Frac" = Pop/sum(Pop)) %>%
mutate_if(is.factor, as.character)
# Compare age distn's
## ----aaFRs, warning=F----------------------------------------------------------------------------------------
## for now, calculate KC, WA and outside KC separately
# King Co ------------------------------------
# Function to create FRs
KC_FR_table <- left_join(estFRs, KC_age_distn,
by = "Age.Range",
stringsAsFactors=F)
kcIFR <- sum(KC_FR_table$IFR * KC_FR_table$Age.Frac)
kcCFR <- sum(KC_FR_table$CFR * KC_FR_table$Age.Frac)
## set up min and max sensitivity range, use +/-Verity range/2
## 0.009 +/- .0.005
kcminIFR <- kcIFR - vintIFR
kcmaxIFR <- kcIFR + vintIFR
## .018 +/- .0015
kcminCFR <- kcCFR - vintCFR
kcmaxCFR <- kcCFR + vintCFR
## nDx fraction
kcndxFrac <- 1 - kcIFR/kcCFR
# WA state total ---------------------------------
WA_FR_table <- left_join(estFRs, WA_age_distn,
by = "Age.Range",
stringsAsFactors=F)
waIFR <- sum(WA_FR_table$IFR * WA_FR_table$Age.Frac)
waCFR <- sum(WA_FR_table$CFR * WA_FR_table$Age.Frac)
## 0.009 +/- .0.005
waminIFR <- waIFR - vintIFR
wamaxIFR <- waIFR + vintIFR
## .018 +/- .0015
waminCFR <- waCFR - vintCFR
wamaxCFR <- waCFR + vintCFR
## nDx fraction
wandxFrac <- 1 - waIFR/waCFR
# Other WA (WA - KC) -------------------------------
OW_FR_table <- left_join(estFRs, OW_age_distn,
by = "Age.Range",
stringsAsFactors=F)
owIFR <- sum(OW_FR_table$IFR * OW_FR_table$Age.Frac)
owCFR <- sum(OW_FR_table$CFR * OW_FR_table$Age.Frac)
## 0.009 +/- .0.005
owminIFR <- owIFR - vintIFR
owmaxIFR <- owIFR + vintIFR
## .018 +/- .0015
owminCFR <- owCFR - vintCFR
owmaxCFR <- owCFR + vintCFR
## nDx fraction
owndxFrac <- 1 - owIFR/owCFR
## ----compareFRtable------------------------------------------------------------------------------------------
frtable <- data.frame("Rate" = c("IFR", NA, NA, "CFR", "", ""),
"Estimate" = rep(c("lower", "MIDPOINT", "upper"),2),
"WA State" = c(waminIFR, waIFR, wamaxIFR,
waminCFR, waCFR, wamaxCFR),
"King County" = c(kcminIFR, kcIFR, kcmaxIFR,
kcminCFR, kcCFR, kcmaxCFR),
"Outside KC" = c(owminIFR, owIFR, owmaxIFR,
owminCFR, owCFR, owmaxCFR)
)
# Display frtable
## ----calcallcounties, warning=F------------------------------------------------------------------------------
all_counties <- unique(WA_age_pop_distn$"Area Name")
all_counties <- all_counties[all_counties !="."]
county_FRs <- data.frame("County" = all_counties,
"IFR" = NA,
"CFR" = NA,
"nDx" = NA,
stringsAsFactors = F)
for(co_index in 1:length(all_counties)){
co_age_distn <- calc_age_distn(county = all_counties[co_index])
county_fr_table <- left_join(estFRs, co_age_distn, by = "Age.Range")
county_FRs[co_index, "IFR"] <- sum(county_fr_table$IFR * county_fr_table$Age.Frac)
county_FRs[co_index, "CFR"] <- sum(county_fr_table$CFR * county_fr_table$Age.Frac)
county_FRs[co_index, "nDx"] <- 1 - county_FRs[co_index,"IFR"]/county_FRs[co_index,"CFR"]
}
## ---Surveillance Data-----------------------------------------------------------------------------------
# Note -- always read this in fresh
# wdrsCases <- rio::import("https://www.doh.wa.gov/Portals/1/Documents/1600/coronavirus/data-tables/PUBLIC-CDC-Event-Date-SARS.xlsx",
# which = "Cases")
#
#
# wdrsDeaths <- rio::import(
# "https://www.doh.wa.gov/Portals/1/Documents/1600/coronavirus/data-tables/PUBLIC-CDC-Event-Date-SARS.xlsx",
# which = "Deaths")
wdrsCases <- readxl::read_excel("data/PUBLIC-CDC-Event-Date-SARS.xlsx", sheet = "Cases")
wdrsDeaths <- readxl::read_excel("data/PUBLIC-CDC-Event-Date-SARS.xlsx", sheet = "Deaths")
lastRepDate <- as.Date(wdrsCases$Day[nrow(wdrsCases)])
# Last week (for reporting delays)
lastWeek <- seq(lastRepDate-7, lastRepDate, 1)
# Temp adjustment for unobserved death date anomalies
## First recorded death
firstDeathDate <- wdrsDeaths %>%
filter(Deaths==1) %>%
summarize(min(Day))
firstDeathDate <- as.Date(as.character(firstDeathDate))
# obslag <- as.numeric(as.Date("2020-02-29") - tmpfirstDeathDate)
# startrow <- which(wdrsDeaths$Day == as.character(tmpfirstDeathDate))[1]
#
# test <- wdrsDeaths
#
# # loop not working yet
# for(i in startrow:obslag) {
# for(j in 3:9) {
# test[i,3:9] <- dplyr::lead(test[i,j],obslag) =
# dplyr::lead(test[i,3:9],obslag) + test[i,3:9]
# }
# }
# Create long version of Cases
temp <- wdrsCases %>%
select(-dtm_updated) %>%
rename(Date = Day,
ncall = NewPos_All,
nc0x19 = "Age 0-19",
nc20x39 = "Age 20-39",
nc40x59 = "Age 40-59",
nc60x79 = "Age 60-79",
nc80 = "Age 80+",
ncunk = "Positive UnkAge") %>%
group_by(County) %>%
mutate(ccall = cumsum(ncall),
cc0x19 = cumsum(nc0x19),
cc20x39 = cumsum(nc20x39),
cc40x59 = cumsum(nc40x59),
cc60x79 = cumsum(nc60x79),
cc80 = cumsum(nc80),
ccunk = cumsum(ncunk)) %>%
ungroup()
library(data.table)
wdrsCasesLong <- melt.data.table(as.data.table(temp),
id = c("County", "Date"),
measure = patterns(nCases="^nc", cCases="^cc"),
variable.name = "ageGrp")
# Create long version of Deaths
temp <- wdrsDeaths %>%
select(-dtm_updated) %>%
rename(Date = Day,
ndall = Deaths,
nd0x19 = "Age 0-19",
nd20x39 = "Age 20-39",
nd40x59 = "Age 40-59",
nd60x79 = "Age 60-79",
nd80 = "Age 80+",
ndunk = "Positive UnkAge") %>%
group_by(County) %>%
mutate(cdall = cumsum(ndall),
cd0x19 = cumsum(nd0x19),
cd20x39 = cumsum(nd20x39),
cd40x59 = cumsum(nd40x59),
cd60x79 = cumsum(nd60x79),
cd80 = cumsum(nd80),
cdunk = cumsum(ndunk)) %>%
ungroup()
wdrsDeathsLong <- melt.data.table(as.data.table(temp),
id = c("County", "Date"),
measure = patterns(nDeaths="^nd", cDeaths="^cd"),
variable.name = "ageGrp")
wdrsLong <- left_join(wdrsCasesLong, wdrsDeathsLong,
by = c("County","Date", "ageGrp")) %>%
mutate(ageGrp = as.numeric(ageGrp),
ageMid = recode(ageGrp,
`2` = 10,
`3` = 30,
`4` = 50,
`5` = 70,
`6` = 85,
.default = NA_real_),
ageBin = ifelse(!is.na(ageMid), ifelse(ageMid < 51, 0, 1), NA),
agecat = recode(ageGrp,
`1` = "all",
`2` = "a0x19",
`3` = "a20x39",
`4` = "a40x59",
`5` = "a60x79",
`6` = "a80",
`7` = "unk"))
# WA df for age-specific trends
WAbyAge <- wdrsLong %>%
mutate(Date = as.Date(Date)) %>%
group_by(Date, agecat) %>%
summarize(nCases = sum(nCases),
nDeaths = sum(nDeaths),
cCases = sum(cCases),
cDeaths = sum(cDeaths),
ageMid = mean(ageMid),
ageBin = mean(ageBin)) %>%
tidyr::replace_na(list(nCases = 0, cCases = 0, nDeaths = 0, cDeaths = 0)) %>%
mutate(nCaseFrac = nCases/nCases[agecat == "all"],
cCaseFrac = cCases/cCases[agecat == "all"],
nDeathFrac = nDeaths/nDeaths[agecat == "all"],
cDeathFrac = cDeaths/cDeaths[agecat == "all"]) %>%
arrange(Date, agecat)
KCbyAge <- wdrsLong %>%
filter(County == "King County") %>%
mutate(Date = as.Date(Date)) %>%
select(Date, agecat, nCases, nDeaths, cCases, cDeaths, ageMid, ageBin) %>%
group_by(Date) %>%
tidyr::replace_na(list(nCases = 0, cCases = 0, nDeaths = 0, cDeaths = 0)) %>%
mutate(nCaseFrac = nCases/nCases[agecat == "all"],
cCaseFrac = cCases/cCases[agecat == "all"],
nDeathFrac = nDeaths/nDeaths[agecat == "all"],
cDeathFrac = cDeaths/cDeaths[agecat == "all"]) %>%
arrange(Date, agecat)
OCbyAge <- wdrsLong %>%
filter(County != "King County") %>%
mutate(Date = as.Date(Date)) %>%
group_by(Date, agecat) %>%
summarize(nCases = sum(nCases),
nDeaths = sum(nDeaths),
cCases = sum(cCases),
cDeaths = sum(cDeaths),
ageMid = mean(ageMid),
ageBin = mean(ageBin)) %>%
tidyr::replace_na(list(nCases = 0, cCases = 0, nDeaths = 0, cDeaths = 0)) %>%
mutate(nCaseFrac = nCases/nCases[agecat == "all"],
cCaseFrac = cCases/cCases[agecat == "all"],
nDeathFrac = nDeaths/nDeaths[agecat == "all"],
cDeathFrac = cDeaths/cDeaths[agecat == "all"])
WAdata <- WAbyAge %>%
filter(agecat != "all") %>%
select(Date, nCases, cCases, nDeaths, cDeaths) %>%
group_by(Date) %>%
summarize(wa_nCases = sum(nCases),
wa_cCases = sum(cCases),
wa_nDeaths = sum(nDeaths),
wa_cDeaths = sum(cDeaths))
KCdata <- KCbyAge %>%
filter(agecat != "all") %>%
select(Date, nCases, cCases, nDeaths, cDeaths) %>%
group_by(Date) %>%
summarize(kc_nCases = sum(nCases),
kc_cCases = sum(cCases),
kc_nDeaths = sum(nDeaths),
kc_cDeaths = sum(cDeaths))
OCdata <- OCbyAge %>%
filter(agecat != "all") %>%
select(Date, nCases, cCases, nDeaths, cDeaths) %>%
group_by(Date) %>%
summarize(oc_nCases = sum(nCases),
oc_cCases = sum(cCases),
oc_nDeaths = sum(nDeaths),
oc_cDeaths = sum(cDeaths))
# note that factor levels are determined by position, make sure the region vars
# are always in WA, KC, OC position
comp <- WAdata %>%
left_join(KCdata, by="Date") %>%
left_join(OCdata, by="Date") %>%
mutate(kcwa.case.ratio = case_when(wa_cCases > 0 ~ kc_cCases/wa_cCases),
kcwa.death.ratio = case_when(wa_cDeaths > 0 ~ kc_cDeaths/wa_cDeaths),
wa_dthpercase = case_when(wa_cCases > 0 ~ 100*wa_cDeaths/wa_cCases),
kc_dthpercase = case_when(kc_cCases > 0 ~ 100*kc_cDeaths/kc_cCases),
oc_dthpercase = case_when(oc_cCases > 0 ~ 100*oc_cDeaths/oc_cCases)
)
comp.long <- reshape2::melt(comp, id="Date")
# Most recent estimates of deaths per dxcase
KCDeathPerCase <- comp$kc_cDeaths[nrow(comp)] / comp$kc_cCases[nrow(comp)]
OWDeathPerCase <- comp$oc_cDeaths[nrow(comp)] / comp$oc_cCases[nrow(comp)]
## First observed death
firstDeathDate <- NA
for (i in 1:dim(KCdata)[1]){
if(is.na(firstDeathDate) & KCdata$kc_cDeaths[i] !=0) {
firstDeathDate = KCdata$Date[i]
}
}
## ----calcLogSlope, warning = F------------------------------------------------------------------------------------
# for ease of analysis and plotting, construct new df
kc_cCounts <- KCdata %>%
select("Date", "kc_cCases", "kc_cDeaths") %>%
rename(cCases = kc_cCases,
cDeaths = kc_cDeaths)
# some have test counts, others don't
nvars = ncol(kc_cCounts)-1
## For log plots regress log(count) on date, and predict values for two ranges:
### 1. the 3 wks before the state order (roughly, 1st death to state order)
range1 <- c(stateOrderDate-21, stateOrderDate)
### 2. Last 2 wks of data -1 wk, (to assess flattening, but avoid reporting delays)
range2 <- c(lastRepDate-21, lastRepDate-7)
# start and end row numbers for range1 regressions
startrow1 <- rep(which(kc_cCounts$Date == range1[1]), nvars)
endrow1 <- rep(which(kc_cCounts$Date == range1[2]), nvars)
# start and end row numbers for range2 regressions
startrow2 <- rep(which(kc_cCounts$Date == range2[1]), nvars)
endrow2 <- rep(which(kc_cCounts$Date == range2[2]), nvars)
logPreds <- list()
logFits <- list()
for(j in 1:2) {
startrow <- get(paste0("startrow",j))
endrow <- get(paste0("endrow",j))
for(i in 1:nvars) {
x <- startrow[i]:endrow[i]
y <- kc_cCounts[x,][[i + 1]]
# omit missing data
x <- x[!is.na(y)]
y <- y[!is.na(y)]
lm_fit <- lm(log(y) ~ x)
index = ifelse(j==1, i, i+j)
# print(index)
logFits[[index]] <- lm_fit
logPreds[[index]] <- data.frame(pred = exp(predict(lm_fit)), x = kc_cCounts[x,1])
}
}
names(logFits) <- names(logPreds) <- c("Cases_beforeSO", "Deaths_beforeSO",
"Cases_lst2wks", "Deaths_lst2wks")
## ----backcalcs-----------------------------------------------------------------------------------------------
# Inputs (calculated above)
estIFR = c(kcminIFR, kcIFR, kcmaxIFR)
estCFR = c(kcminCFR, kcCFR, kcmaxCFR)
lag = c(16, 17, 19)
# Dimensions (for easy calcs)
days <- dim(KCdata)[1]
frs <- length(estCFR)
lags <- length(lag)
# Setup arrays for total, ndx and tot
## leave room for date and lag value on each lag layer
totArray <- dxArray <- ndxArray <- fdxArray <-
unobstotArray <- unobsdxArray <- obsdxbylagArray <-
array(NA, c(days, frs+2, lags))
# Observable time window
## Last report date, and reporting delay
delay <- as.numeric(Sys.Date() - lastRepDate)
## Last prediction, given lag dates (vector)
lastEstDateVec <- as.Date(lastRepDate - lag)
# last prediction date for MIDPOINT estimates
lastEstDate <- lastEstDateVec[2]
# Calculations:
# Estimates only calcluated after the 1st observed death,
# date is in 1st col, lag in last col, for each layer k
## so j loop is over cols 2, 3, 4
# note we calculate the obsdxbylag here, even though it doesn't
## vary by lag, and modify it later.
for (k in 1:lags){
for (j in 2:(frs+1)){
for(i in 1:(days-lag[k])) {
if(dplyr::lead(KCdata$kc_cDeaths,lag[k])[i] != 0){
totArray[i,j,k] <- round(dplyr::lead(KCdata$kc_cDeaths,lag[k])[i]/estIFR[j-1],0)
dxArray[i,j,k] <- round(dplyr::lead(KCdata$kc_cDeaths,lag[k])[i]/estCFR[j-1],0)
ndxArray[i,j,k] <- totArray[i,j,k] - dxArray[i,j,k]
fdxArray[i,j,k] <- dxArray[i,j,k] - KCdata$kc_cCases[i]
unobstotArray[i,j,k] <-
(totArray[i,j,k] - KCdata$kc_cCases[i])/totArray[i,j,k]
unobsdxArray[i,j,k] <- (dxArray[i,j,k] - KCdata$kc_cCases[i])/dxArray[i,j,k]
}
}
for (i in 1:days) {
if(KCdata$kc_cDeaths[i] != 0){
obsdxbylagArray[i,j,k] <- estCFR[j-1] * KCdata$kc_cCases[i]/KCdata$kc_cDeaths[i]
}
}
}
totArray[,1,k] <- dxArray[,1,k] <- ndxArray[,1,k] <- fdxArray[,1,k] <-
unobstotArray[,1,k] <- unobsdxArray[,1,k] <- obsdxbylagArray[,1,k] <-
KCdata$Date
totArray[,frs+2,k] <- dxArray[,frs+2,k] <- ndxArray[,frs+2,k] <- fdxArray[,frs+2,k] <-
unobstotArray[,frs+2,k] <- unobsdxArray[,frs+2,k] <- obsdxbylagArray[,frs+2,k] <-
lag[k]
}
# Reshape function for plotting
# For ribbons, need to keep CFRs as cols to set ymin and ymax,
# lag layers are appended in blocks
rshp <- function(x, rate) {
y <- aperm(x, c(1,3,2))
dim(y) <- c(days*lags, frs+2)
y <- data.frame(y)
colnames(y) <- c("Date",
paste0("kcmin",rate),
paste0("kc", rate),
paste0("kcmax", rate),
"Lag")
# Dates are persnickety, restore format
y$Date <- as.Date(y$Date, origin = origin)
return(y)
}
#debug(rshp)
tot <- rshp(totArray, "IFR")
ndx <- rshp(ndxArray, "FRs")
fdx <- rshp(fdxArray, "CFR")
unobstotFrac <- rshp(unobstotArray, "IFR")
unobsdxFrac <- rshp(unobsdxArray, "CFR")
obsdxbylagFrac <- rshp(obsdxbylagArray, "CFR")
obsdxbylagFrac <- obsdxbylagFrac[1:days,] # b/c no variation by lag
# last observed obsdxbylagFrac
lastobsdxbylagFrac <- obsdxbylagFrac$kcCFR[days]
## ---- keytableCalc, include=F--------------------------------------------------------------------------------
# Total
test <- tot$kcIFR[tot$Date == lastEstDate & tot$Lag == lag[2]]
tupper <- tot$kcminIFR[tot$Date == lastEstDate & tot$Lag == lag[2]]
tlower <- tot$kcmaxIFR[tot$Date == lastEstDate & tot$Lag == lag[2]]
# Never Diagnosed
nest <- ndx$kcFRs[ndx$Date == lastEstDate & ndx$Lag == lag[2]]
nupper <- ndx$kcminFRs[ndx$Date == lastEstDate & ndx$Lag == lag[2]]
nlower <- ndx$kcmaxFRs[ndx$Date == lastEstDate & ndx$Lag == lag[2]]
# Unobserved Fractions
uesttot <- percent(unobstotFrac$kcIFR[unobstotFrac$Date == lastEstDate &
unobstotFrac$Lag == lag[2]], .1)
uuppertot <- percent(unobstotFrac$kcminIFR[unobstotFrac$Date == lastEstDate &
unobstotFrac$Lag == lag[2]], .1)
ulowertot <- percent(unobstotFrac$kcmaxIFR[unobstotFrac$Date == lastEstDate &
unobstotFrac$Lag == lag[2]], .1)
uestdx <- percent(unobsdxFrac$kcCFR[unobsdxFrac$Date == lastEstDate &
unobsdxFrac$Lag == lag[2]], .1)
uupperdx <- percent(unobsdxFrac$kcminCFR[unobsdxFrac$Date == lastEstDate &
unobsdxFrac$Lag == lag[2]], .1)
ulowerdx <- percent(unobsdxFrac$kcmaxCFR[unobsdxFrac$Date == lastEstDate &
unobsdxFrac$Lag == lag[2]], .1)
cumInfFrac = c(tlower, test, tupper)/popn
keytable <- data.frame(Lower = c(NA,
comma(tlower),
percent(cumInfFrac[1],.1),
comma(nlower),
ulowertot,
ulowerdx),
Midpoint = c(comma(KCdata$kc_cCases[days]),
comma(test),
percent(cumInfFrac[2], .1),
comma(nest),
uesttot,
uestdx),
Upper = c(NA,
comma(tupper),
percent(cumInfFrac[3], .1),
comma(nupper),
uuppertot,
uupperdx),
row.names = c("Observed Cumulative Cases",
"Total Infections (estimated)",
"Cumulative Infection Fraction (estimated)",
"Never Dx Infections (estimated)",
"Unobserved % of Total Infections",
"Unobserved % of Dx Infections"))
We use a simple backcalculation approach to estimate the total number of Covid-19 infections in King County, and several other quantities of interest, with a sensitivity analysis for key parameters. Surveillance data are taken from the data released by WA DOH (as an excel file download from https://www.doh.wa.gov/emergencies/coronavirus)
Bottom line: There is consistent evidence from surveillance data that we have “bent the curve” substantially, but there is also evidence that the epidemic is shifting into younger populations. This shift will reduce mortality counts and change the correlation between overall mortality counts and overall infection counts. Under these conditions, an age-stratified back-calculation is required for accurate estimation of total infections.
Our key findings are reported on the tabs below.
vlinedates = c(firstDeathDate, stateOrderDate) # plotly needs as.integer
keyplot <- data.frame(date = KCdata$Date,
EstNeverDx = ndx$kcFRs[ndx$Lag == lag[2]],
EstFutureDx = fdx$kcCFR[tot$Lag == lag[2]],
DxLiving = KCdata$kc_cCases-KCdata$kc_cDeaths,
DxDeaths = KCdata$kc_cDeaths)
keyplotLong <- melt(keyplot,id="date",
value.name = "Infections",
variable.name = "Category")
yrng = range(tot$kcIFR[tot$Lag == lag[2]], na.rm=T)
# secondary axis shows cumInfFrac value as %
summary.plot <- ggplot(data = keyplotLong) +
geom_area(aes(y = Infections, x=date,
fill=Category),
na.rm = T, alpha = 0.5) +
geom_vline(xintercept = as.integer(vlinedates),
size = .5,
color = "gray") +
scale_y_continuous(labels = comma,
# breaks = seq(0, 40000, 5000),
sec.axis = sec_axis(~ . /popn,
labels = scales::percent_format(accuracy = .1),
name = "Cumulative Infection Fraction (%)")) +
scale_fill_discrete(labels = c("Never Dx", "Will be Dx", "Dx Living", "Dx Deaths")) +
# xlim(as.Date("2020-01-28"), lastRepDate) +
annotate("text",
x=firstDeathDate, y=yrng[2],
label= "First Death",
hjust = 1, vjust = 1, size = 3,
color = "darkgray") +
annotate("text",
x=stateOrderDate, y=yrng[2],
label= "State Order",
hjust = 1, vjust = 1, size = 3,
color = "darkgray") +
annotate("text",
x=lastEstDate,
y= yrng[2],
hjust = 0,
label= "Mortality lag period", size=3,
color = "darkgray") +
labs(title = "Midpoint Estimates of Cumulative Total Infections by Diagnosis Category",
subtitle = "Estimates depend on CFR; Cumulative infection fraction on right axis",
x = "Date",
y = "Infection Count")
plot(summary.plot)
Surveillance data counts
As of 2020-04-17, Covid-19 surveillance data reports for King County show:
Cumulative case count: 5,024, 43% of the WA State total.
Cumulative death count: 339, 54% of the WA State total.
Death rate per 100 cases: 6.7, 60% higher than the average rate for other counties in WA.
Estimates of total infections
Our midpoint estimates of the true number of infections are shown below, with sensitivity ranges in parentheses. Given the mortality lag period for this backcalculation, and reporting delays, our most recent estimates are for 2020-03-31. At that time, the cumulative number of cases in King County was 3,578
Estimated cumulative total infections: 44,803 (27,625, 118,469)
Estimated cumulative fraction of people infected to date: 2.0% (1.2%, 5.2%)
Estimated fraction of infections that are unobserved: 92.0% (87.0%, 97.0%)
Estimated fraction of infections that will never be diagnosed: 48.4% (constant)
The epidemic curve appears to be “flattening”, but…
In this backcalculation approach, the estimated epidemic curve is extrapolated from the observed mortality curve, so it is the dramatic decline in daily mortality counts that is generating this “flattening” of the estimated epidemic curve. Flattening is what everyone wants to see, and is a key benchmark for lifting social distancing restrictions. So, a word of caution is particularly appropriate.
Mortality counts are subject to both reporting delays and underreporting. Declines and flattening should be discounted accordingly.
The age-composition of the infected population is changing, due to early Covid-19 clustering in long term care facilities. In this context, declines in mortality may not imply a corresponding reduction in overall growth of infections.
At any point in time, over 90% of those infected have not been diagnosed
Our methods also estimate that roughly half of infections will never be diagnosed.
The high fraction of unobserved infections is not necessarily a problem for individuals, as reports suggest that asymptomatic infection may be common, and the complications of COVID-19 that lead to hospitalization are below 5% for those under 50 years old (Verity, et al.). But at the population level, this does suggest that unobserved infections are a significant fraction of the caseload at any point in time.
These unobserved infections likely contribute to ongoing transmission, which poses a problem for ongoing symptom-based control measures, including testing and contact tracing.
The goal of this analysis is to estimate the cumulative total number of infected persons over time for COVID-19 and related quantities in King County, Washington. We employ a simple back-calculation approach.
The intuition for back-calculation is that current cumulative mortality counts are based on persons who were infected earlier in time: so we can scale up the death counts we observe today by the (inverse of the) fatality rate in order to estimate the cumulative number of infected persons implied in the past. This method is designed to estimate cumulative infected persons in the past, not to predict case trajectories in the future.
Here, we use a 2-step approach:
Backcalculate the cumulative total infections at each point in time (Total) from the infection fatality rate (IFR) and surveillance data on the cumulative death counts in subsequent period. This also provides an estimate of the cumulative infection prevalence over time.
Backcalculate the cumulative infections that will eventually be diagnosed at each point in time (Dx) from the case fatality rate (CFR) and the death count data. The number of never diagnosed infections (nDx) are obtained by subtracting the Dx from the Total.
The method relies on three external parameter estimates: the lag time from infection to death, the IFR, and the CFR. We conduct sensitivity analyses on all of these parameters, so our estimates are presented along with a sensitivity range. Details are provided at the end of the report in the Backcalculation Methodology section.
A note on terminology: we use the term “infections” or “infected persons” to refer to the unobserved true values in the population that we are estimating, and we reserve the term “cases” to refer to the observed diagnosed cases that are captured in surveillance data over time. All of the estimates are presented as cumulative counts; we examine new counts per day only for the surveillance data.
Questions and comments welcome. You can leave feedback as an issue on the GitHub repository: https://github.com/statnet/CovidBackCalc. Or, if that’s not your style, just send me an email.
We start by plotting the surveillance data: cumulative diagnosed case and death counts, and new counts per day. In these plots, the “State Order” refers to the WA Governor’s “Stay Home, Stay Healthy” order issued on 3/23/2020.
yrng <- range(kc_cCounts$cCases)
kcldf <- reshape2::melt(kc_cCounts, id="Date")
p <- ggplot(data = kcldf,
aes(x=Date,
y=value,
color=variable,
group=variable)) +
geom_line() + geom_point(alpha = 0.5) +
geom_vline(xintercept = as.integer(vlinedates),
size = .5,
color = "gray") +
annotate("text",
x=firstDeathDate-9,
y= yrng[2],
label= "First Death",
hjust = 1, vjust = 1, size = 3,
color = "darkgray") +
annotate("text",
x=stateOrderDate-9,
y= yrng[2],
label= "State Order",
hjust = 1, vjust = 1, size = 3,
color = "darkgray") +
labs(x = "Date", y = "Number",
title = "King County Cumulative Case and Death Counts",
color = "Count",
caption = "Data Source: PHSKC Coronavirus Dashboard")
ggplotly(p) #plotly doesn't respect hjust :\
The log plot clearly shows we have bent both curves. We compare the growth rates in the 3 weeks before the state order, to the 2 week period 1 week before the last reporting date. We eliminate that last week to avoid artifacts due to reporting delays.
Neither curve has hit zero growth, but the epidemic is beginning to come under control. Still, some caution should be exercised before making inferences. Considerations are listed below the plot.
p <- ggplot(data = kcldf,
aes(x=Date,
y=value,
color=variable, group=variable)) +
ylim(0,10000) +
scale_y_log10(breaks=c(1,10,100,1000,10000),
labels= scales::comma_format(c(1,10,100,1000,10000))) +
scale_color_discrete(labels = c("Cases", "Deaths")) +
labs(x = "Date",
y = "Number of Infections (Log scaling)",
title = "Log Cumulative Counts: Cases and Deaths",
color = "Count")
# add log relevant lines
for(pow in 0:3)
{
for(level in 1:10)
{
val <- (10^pow)*level
p <- p + geom_hline(yintercept = val,
color="gray80", size=0.15)
}
}
# plot the data
p <- p +
geom_line() +
geom_point(alpha=0.5)
# plot the regression lines on top of the data
# nvars (2 or 3) x 2 periods
index = nvars*2
for(i in 1:index)
{
p <- p +
geom_line(color = "gray27", data = logPreds[[i]],
aes(x=Date, y=pred),
size=0.5,
inherit.aes=FALSE)
}
# add labels for growth rates, with manual positioning
# index = {1,2} for first period, {3,4} for second
ypos <- c(1100,30,10000,600)
xpos <- c(rep(range1[1]+7, 2), rep(range2[1]+5, 2))
for(i in 1:index) {
p <- p +
annotate(geom="text", size=2.5,
label=paste("growth/day\n",
round(100*(exp(logFits[[i]]$coefficients[["x"]]) - 1),1),
"%",
sep = ""),
y = ypos[i],
x = xpos[i],
hjust = 0)
}
# add date context
p <- p +
geom_vline(xintercept = as.integer(vlinedates),
size = .5,
color = "gray") +
# annotate("text",
# x=firstDeathDate-9,
# y= yrng[2]*1.3,
# label= "First Death",
# hjust = 1, vjust = 1, size = 3,
# color = "darkgray") +
annotate("text",
x=stateOrderDate-9,
y= yrng[2]*1.3,
label= "State Order",
hjust = 1, vjust = 1, size =3,
color = "darkgray")
# show the plot
ggplotly(p)
Cumulative Cases: Bending of this curve will be strongly influenced by the number of tests performed. King Co does not report this number, but WA State does: the number per rose to a peak of roughly 5000/day in the 3rd week of March, and has fallen slightly since the beginning of April. Fewer tests can lead to fewer cases, all else equal.
Cumulative Deaths: Two factors influence this curve. The first is delays in reporting, which bias the results, and the second is age-composition of the infected population, which influences interpretation of the results.
Delays in reporting – This leads to underestimates in the last week of counts, so any recent changes in slope should be discounted.
We ignore the last week of data when calculating the growth rate to exclude delay-induced declines.
Age-composition of cases – If the age composition of the infected population is changing, changes in the mortality count curve may not reflect changes in the overall growth of infections. In particular, if the epidemic was initially established in older populations, and then began to move out into the general population, one would expect to see higher mortality rates during the first period, and lower rates in the second. This would not indicate a reduction in the growth of infections.
There is evidence that the age composition of the KC caseload is trending younger (see the next section).
We would urge caution when interpreting any “flattening” of the mortality count curve until a proper age-adjusted analysis can be performed.
The new case curve appears to be falling since the State Order on March 23. The decline is substantial. Some caution is in order given that the last week’s numbers will be influenced by reporting delays, and we do not know how many people are being tested in King County. But it seems unlikely that either of these would change the fundamental pattern.
repDelay <- data.frame(Date = lastWeek,
kc_nCases = rep(max(KCdata$kc_nCases, na.rm=T)))
ggplot(KCdata,
aes(x=Date, y=kc_nCases)) +
geom_col(alpha = 0.5) +
geom_smooth(fill = "#3366FF", alpha = 0.2) +
geom_vline(xintercept = as.integer(vlinedates[2]),
size = .5,
color = "gray") +
annotate("text",
x=stateOrderDate-1, y=max(KCdata$kc_nCases, na.rm=T),
label= "State Order",
hjust = 1, vjust = 1, size = 3,
color = "darkgray") +
geom_ribbon(data=repDelay,
aes(ymin=0, ymax=kc_nCases),
color="gray", alpha=0.4) +
annotate("text",
label = "Rep.Delay",
size = 3, color = "gray",
x=repDelay$Date[4],
y=max(KCdata$kc_nCases, na.rm=T)+5)
labs(x = "Date", y = "Number",
title = "New Cases per Day",
caption = "Data Source: PHSKC Coronavirus Dashboard")
$x
[1] "Date"
$y
[1] "Number"
$title
[1] "New Cases per Day"
$caption
[1] "Data Source: PHSKC Coronavirus Dashboard"
attr(,"class")
[1] "labels"
The death count curve is also falling. Here, more caution is required as the death rate will fall because the age distribution of the caseload is trending younger (see the next section on Changes in Age Composition).
repDelay <- data.frame(Date = lastWeek,
kc_nDeaths = rep(max(KCdata$kc_nDeaths, na.rm=T)))
ggplot(KCdata,
aes(x=Date, y=kc_nDeaths)) +
geom_col(alpha = 0.5) +
geom_smooth(color = "indianred", fill = "indianred", alpha = 0.2) +
geom_vline(xintercept = as.integer(vlinedates[2]),
size = .5,
color = "gray") +
annotate("text",
x=stateOrderDate-1, y=max(KCdata$kc_nDeaths, na.rm=T)+1,
label= "State Order",
hjust = 1, size = 3,
color = "darkgray") +
geom_ribbon(data=repDelay,
aes(ymin=0, ymax=kc_nDeaths),
color="gray", alpha=0.4) +
annotate("text",
label = "Rep.Delay",
size = 3, color = "gray",
x=repDelay$Date[4],
y=max(KCdata$kc_nDeaths, na.rm=T)+1) +
labs(x = "Date", y = "Number",
title = "New Deaths per Day",
caption = "Data Source: PHSKC Coronavirus Dashboard")
The new case counts are sufficiently noisy that the trend is best observed for the cumulative cases.
ggplot(filter(KCbyAge, agecat != "all" & agecat != "unk"),
aes(x = as.Date(Date), y=cCaseFrac,
group=agecat, fill=agecat)) +
geom_area(na.rm = T, alpha = 0.5) +
xlim(as.Date("2020-02-15", origin=origin), Sys.Date()) +
labs(title = "KC Age Composition of Cumulative Cases",
x = "Date",
y = "Fraction")
ggplot(filter(KCbyAge, agecat != "all" & agecat != "unk"),
aes(x = as.Date(Date), y=cDeathFrac,
group=agecat, fill=agecat)) +
geom_area(na.rm = T, alpha = 0.5) +
xlim(as.Date("2020-02-15", origin=origin), Sys.Date()) +
labs(title = "KC Age Composition of Cumulative Deaths",
x = "Date",
y = "Fraction")
King County currently comprises 43% of the state total.
# make sure factor order of comp.long is correct for consistent labeling
p <- ggplot(data= filter(comp.long,
variable %in% c("wa_cCases",
"kc_cCases",
"oc_cCases")),
aes(x = Date, y=value, color=variable)) +
geom_point(alpha = .6) +
scale_color_discrete(labels = c("WA State Total",
"King County",
"Other Counties")) +
labs(title = "Cumulative Diagnosed Cases",
color = "Region",
x = "Date",
y = "Number")
plot(p)
King County has consistently reported the majority of Covid-19 deaths in the State. It currently comprises 54% of the state total.
# truncate x axis at presumed first death
xrng = c(as.Date("2020-02-29"), lastRepDate)
# check order in comp.long for consistent labeling
p <- ggplot(data= filter(comp.long, variable %in% c("wa_cDeaths",
"kc_cDeaths",
"oc_cDeaths")),
aes(x = Date, y=value, color=variable)) +
geom_point(alpha = .6) +
xlim(xrng) +
scale_color_discrete(labels = c("WA State Total",
"King County",
"Other Counties")) +
labs(title = "Cumulative Deaths",
subtitle = "x-axis truncated at presumed first death date 2020-02-29",
x = "Date",
y = "Number",
color = "Region")
plot(p)
The death rates for all regions are falling, which suggests that the age-composition of the caseload is becoming younger.
The death rate in King County is, consistently, disproportionately high, especially given the fact that the age composition of the county’s general population is younger than the rest of the state. The difference could reflect a true difference in death rates, for example, if KC had relatively more cases among the elderly. Or it could instead be an artifact of testing rates, for example, if KC has relatively higher prevalence, the limited testing will lower the case ascertainment rate, but probably not the death ascertainment rate.
# check order in comp.long for consistent labeling
p <- ggplot(data= filter(comp.long,
variable %in% c("wa_dthpercase" ,
"kc_dthpercase",
"oc_dthpercase")),
aes(x = Date, y=value, color=variable)) +
geom_point(alpha = .6) +
xlim(xrng) +
scale_y_continuous(labels = comma,
breaks = seq(0, 30, 5),
limits = c(0, 30)) +
scale_color_discrete(labels = c("WA State Total",
"King County",
"Other Counties")) +
labs(title = "Death Rate per 100 Diagnosed Cases",
subtitle = "x-axis truncated at presumed first death date 2020-02-29",
# subtitle = "Y-axis truncated at 15",
x = "Date",
y = "Death Rate",
color = "Region")
plot(p)
The plots of our estimates here, and for the rest of the report, show the values estimated with the KC age-adjusted fatality rates (the grey-scale lines), and the sensitivity range for the fatality rate estimates (the bands around the lines), for each assumed lag value (the color of the lines/bands). Where appropriate, the observed diagnosed cases are plotted for reference.
myBlues = brewer.pal(n = 9, "Blues")[5:9] #exluded the two lighter hues
myGreys = brewer.pal(n = 9, "Greys")[5:9] #exluded the two lighter hues
x_annotate_fr <- lastEstDate+2
y_annotate_max <- max(tot$kcminIFR, na.rm=T) #inverse proportional
y_annotate_min <- max(tot$kcmaxIFR, na.rm=T)
y_annotate_kc<- max(tot$kcIFR, na.rm=T)
# uses IFR
ggplot(data=tot,
aes(x = Date)) +
geom_ribbon(aes(ymin = kcmaxIFR, ymax = kcminIFR,
group = Lag, fill=factor(Lag)),
alpha=0.4) +
scale_fill_manual(values = myBlues) +
scale_y_continuous(labels = comma) +
geom_line(aes(y=kcIFR,
group = Lag,
color = factor(Lag)),
size = 1,
show.legend = F) +
scale_color_manual(values = myGreys) +
geom_line(data = KCdata,
aes(x = Date,
y = kc_cCases,
color = "kc_cCases"),
color="darkgoldenrod1") +
geom_point(data = KCdata,
aes(x = Date,
y = kc_cCases),
color="darkgoldenrod", alpha = 0.5) +
annotate("text", label= "obsDx Cases",
x=Sys.Date()-delay,
y=(max(KCdata$kc_cCases)*2),
hjust = 1, vjust = 0, size = 3.5,
color = "darkgoldenrod4") +
geom_vline(xintercept = as.integer(vlinedates[2]),
size = .5,
color = "gray") +
annotate("text",
x=stateOrderDate-1, y=y_annotate_max*1.1,
label= "State Order",
hjust = 1, vjust = 1, size = 3,
color = "darkgray") +
annotate("text", x=x_annotate_fr, y=y_annotate_max*1.1,
label= "IFR:",
hjust = 0, size = 4,
color = myGreys[4]) +
annotate("text", x=x_annotate_fr, y=y_annotate_max,
label= percent(kcminIFR, .1),
hjust = 0, size = 3,
color = myGreys[4]) +
annotate("text", x=x_annotate_fr, y=y_annotate_kc,
label= paste(percent(kcIFR, .1), "(KC)"),
hjust = 0, size = 3,
color = myGreys[4]) +
annotate("text", x=x_annotate_fr, y=y_annotate_min,
label= percent(kcmaxIFR, .1),
hjust = 0, size = 3,
color = myGreys[4]) +
labs(title = "Estimated Total Number of Infected Persons",
subtitle = "Depends on IFR only",
x = "Date",
y = "Infections",
fill = "Lag (days)")
The log curve is beginning to flatten – this is driven by the mortality curve we saw in the observed data.
# Set axis limits c(min, max)
xmin <- firstDeathDate - max(lag) # restrict x-axis for ribbon plotting
xmax <- NA
ymin <- 10
ymax <- NA
# Uses IFR
ggplot(data=tot,
aes(x = Date)) +
geom_ribbon(aes(ymin = kcmaxIFR, ymax = kcminIFR,
group = Lag, fill=factor(Lag)),
alpha=0.4) +
scale_fill_manual(values = myBlues) +
scale_y_log10(labels = comma,
limits = c(ymin, ymax)) +
scale_x_date(limits = c(xmin, xmax)) +
geom_line(aes(y=kcIFR,
group = Lag,
color = factor(Lag)),
size = 1,
show.legend = F) +
scale_color_manual(values = myGreys) +
geom_line(data = KCdata,
aes(x = Date,
y = kc_cCases,
color = "kc_cCases"),
color="darkgoldenrod1") +
geom_point(data = KCdata,
aes(x = Date,
y = kc_cCases),
color="darkgoldenrod", alpha = 0.5) +
geom_vline(xintercept = as.integer(vlinedates[2]),
size = .5,
color = "gray") +
annotate("text",
x=stateOrderDate-1, y=y_annotate_max*1.5,
label= "State Order",
hjust = 1, vjust = 1, size = 3,
color = "darkgray") +
annotate("text", label= "obsDx Cases",
x=Sys.Date()-1,
y=(max(KCdata$kc_cCases)*0.4),
hjust = 1, size = 3.5,
color = "darkgoldenrod4") +
annotate("text",
label= "IFR:",
x=x_annotate_fr, y=y_annotate_max*1.5,
hjust = 0, size = 3,
color = myGreys[4]) +
annotate("text",
label= percent(kcminIFR, .1),
x=x_annotate_fr, y=y_annotate_max,
hjust = 0, size = 2,
color = myGreys[4]) +
annotate("text",
label= paste(percent(kcIFR, .1), "(KC)"),
x=x_annotate_fr, y=y_annotate_kc,
hjust = 0, size = 2,
color = myGreys[4]) +
annotate("text",
label= percent(kcmaxIFR, .1),
x=x_annotate_fr, y=y_annotate_min,
hjust = 0, size = 2,
color = myGreys[4]) +
labs(title = "Estimated Total Number of Infected Persons",
subtitle = "Depends on IFR only",
x = "Date",
y = "Infections",
fill = "Lag (days)")
Lots of numbers here, given the multiple sensitivity analyses. We break them up by the lag period. Note: The values are NA until the \(lag\) days before the first observed death: 2020-01-17, on page 2 of the table.
The estimated nDx infections are given by the estimated Total (from above) minus the estimated Dx infections.
#myBlues = brewer.pal(n = 9, "Blues")[5:9] #exluded the two lighter hues
#myGreys = brewer.pal(n = 9, "Greys")[5:9] #exluded the two lighter hues
y_ann = c(max(ndx$kcminFRs, na.rm=T),
max(ndx$kcFRs, na.rm=T),
max(ndx$kcmaxFRs, na.rm=T)*1.3)
# Uses both FRs
ggplot(data=ndx,
aes(x = Date)) +
geom_ribbon(aes(ymin = kcmaxFRs, ymax = kcminFRs,
group = Lag, fill=factor(Lag)),
alpha=0.4) +
scale_fill_manual(values = myBlues) +
scale_y_continuous(labels = comma) +
geom_line(aes(y=kcFRs,
group = Lag,
color = factor(Lag)),
size = 1,
show.legend = F) +
scale_color_manual(values = myGreys) +
geom_line(data = KCdata,
aes(x = Date,
y = kc_cCases,
color = "kc_cCases"),
color="darkgoldenrod1") + #,
# show.legend = T) +
geom_point(data = KCdata,
aes(x = Date,
y = kc_cCases),
color="darkgoldenrod", alpha = 0.5) +
annotate("text", label= "obsDx Cases",
x=as.Date(lastRepDate), y=0,
hjust = 1, vjust = 1,
size = 3,
color = "darkgoldenrod4") +
geom_vline(xintercept = as.integer(vlinedates[2]),
size = .5,
color = "gray") +
annotate("text",
x=stateOrderDate-1, y=(y_ann[1]*1.1),
label= "State Order",
hjust = 1, vjust = 1, size = 3,
color = "darkgray") +
annotate("text",
label= "Fatality Rates:",
x=x_annotate_fr, y=(y_ann[1]*1.1),
hjust = 0, size = 3,
color = myGreys[4]) +
annotate("text",
label= "minimum",
x=x_annotate_fr, y=y_ann[1],
hjust = 0, size = 3,
color = myGreys[4]) +
annotate("text",
label= "KC estimates",
x=x_annotate_fr, y=y_ann[2],
hjust = 0, size = 3,
color = myGreys[4]) +
annotate("text",
label= "maximum",
x=x_annotate_fr, y=y_ann[3],
hjust = 0, size = 3,
color = myGreys[4]) +
labs(title = "Estimated Never Diagnosed Infections",
subtitle = "Depends on both IFR and CFR",
x = "Date",
y = "Infections",
fill = "Lag")
Lots of numbers here, given the multiple sensitivity analyses. We break them up by the lag period. Note: The values are NA until the \(lag\) days before the first observed death: 2020-01-17, on page 2 of the table.
This plots the Total infections minus the observed diagnosed cases from surveillance data (Total - obsDx), as a fraction of the Total. Given the large fraction of persons that are estimated to never be diagnosed, and the delay in diagnosis, this unobserved fraction is above 90% over the entire period.
#myBlues = brewer.pal(n = 9, "Blues")[5:9] #exluded the two lighter hues
#myGreys = brewer.pal(n = 9, "Greys")[5:9] #exluded the two lighter hues
y_lim = c(.75, 1)
# Uses only IFR
ggplot(data=unobstotFrac,
aes(x = Date)) +
geom_ribbon(aes(ymin = kcmaxIFR, ymax = kcminIFR,
group = Lag, fill=factor(Lag)),
alpha=0.4) +
scale_fill_manual(values = myBlues) +
ylim(y_lim) +
geom_line(aes(y=kcIFR,
group = Lag,
color = factor(Lag)),
size = 1,
show.legend = F) +
scale_color_manual(values = myGreys) +
annotate("text",
x=as.Date("2020-01-25"), y=y_lim[1]+.025,
hjust = 0,
label= paste("Note: y axis starts at", percent(y_lim[1], .1)),
color = "darkgray") +
labs(title = "Estimated Unobserved Fraction of Total Caseload",
subtitle = "Depends on IFR only",
x = "Date",
y = "Fraction",
fill = "Lag")
This restricts the focus to infections that will be diagnosed (Dx) and plots the unobserved infections (Dx - obsDx) as a fraction of the infections that will be diagnosed. This fraction dips below 85% at the end of the observable window.
myBlues = brewer.pal(n = 9, "Blues")[5:9] #exluded the two lighter hues
myGreys = brewer.pal(n = 9, "Greys")[5:9] #exluded the two lighter hues
y_lim = c(0.75, 1)
# Uses both FRs
ggplot(data=unobsdxFrac,
aes(x = Date)) +
geom_ribbon(aes(ymin = kcmaxCFR, ymax = kcminCFR,
group = Lag, fill=factor(Lag)),
alpha=0.4) +
scale_fill_manual(values = myBlues) +
ylim(y_lim) +
geom_line(aes(y=kcCFR,
group = Lag,
color = factor(Lag)),
size = 1,
show.legend = F) +
scale_color_manual(values = myGreys) +
annotate("text",
x=as.Date("2020-01-25"), y=y_lim[1]+.025,
hjust = 0,
label= paste("Note: y axis starts at", percent(y_lim[1], .1)),
color = "darkgray") +
labs(title = "Estimated Unobserved Fraction of Eventually Dx Infections",
subtitle = "Depends on CFR only",
x = "Date",
y = "Fraction",
fill = "Lag")
This plots focuses instead on the observed fraction, and looks ahead from the estimated Dx at time \(t\) to the observed diagnosed cases in surveillance data \(lag\) days later. In essence, it asks "What fraction of the infections that we believe will eventually be diagnosed (\(Dx_t\)), are diagnosed within the estimated mortality lag time in the future (\(obsDx_{t+lag}\)). We take the ratio of these two quantities, so the value is a fraction.
Note: The IFR:CFR ratio implies that 48.4% of infections will never be diagnosed. This plot excludes those infections.
mycolor = brewer.pal(n = 9, "OrRd")[5]
y_lim = c(0, 0.60)
# Uses both FRs
ggplot(data=obsdxbylagFrac,
aes(x = Date)) +
geom_ribbon(aes(ymin = kcmaxCFR, ymax = kcminCFR),
fill = mycolor,
alpha=0.6) +
scale_fill_manual(values = myBlues) +
ylim(y_lim) +
geom_line(aes(y=kcCFR),
size = 1,
color = myGreys[3],
show.legend = F) +
annotate("text",
x=as.Date("2020-01-25"), y=y_lim[2]-0.025,
hjust = 0,
label= paste("Note: y axis stops at", percent(y_lim[2], .1)),
color = "darkgray") +
labs(title = "Fraction of Eventually Dx Infections Observed Within the Mortality Lag Period",
subtitle = "Depends on CFR only",
x = "Date",
y = "Fraction")
Lots of numbers here, given the multiple sensitivity analyses, and the different denominators used. Since there is little variability in the unobserved fractions (based on both the Total and the Dx infections) we report only the fraction of Dx infections that are observed (in surveillance data, obsDx) within the mortality lag period.
Note: The values are NA until the \(lag\) days before the first observed death: 2020-01-17, on page 2 of the table.
Note: The IFR:CFR ratio implies that 48.4% of infections will never be diagnosed. This table excludes those infections.
We estimate the Total COVID-19 infections in King County over time, and derive the number of infections that will never be diagnosed and the unobserved fraction of those that will be diagnosed.
The most recent estimates are for 2020-03-31 (given the midpoint 17 day lag period and the current 7 day delay in KC DOH case reports).
This plot is based on the sensitivity parameter midpoint values:
DT::datatable(keytable, caption = "Table of Key Estimates",
options = list(columnDefs = list(list(
className = 'dt-right', targets = 1:3
))))
The cumulative fraction of people infected in King County was estimated to be 2.0% (sensitivity range 1.2% - 5.2% as of 2020-03-31. If mortality counts are underestimated, true exposure rates may be higher than these estimates (see sensitivity)
As of 2020-03-31
None of the estimates are increasing by much. All of the cumulative curves have essentially “flattened”
Note that, with our method, this is driven entirely by the observed mortality counts: when those drop, these curves begin to flatten, and when new mortality counts drop to zero, the estimated curves will be flat.
Key questions that need to be answered before we can confidently draw that conclusion are:
The mortality count curve declines began shortly after the State Order on March 23. While the ultra-low counts in the past week are probably influenced by reporting delays, the trend is now long enough, and large enough, that it is clearly not an artifact.
The age distribution is getting younger. So the flattening of the mortality curves should be interpreted with care until a full age-adjusted analysis is completed.
Estimated unobserved fractions, relative to total infections, and to the infections that will eventually be diagnosed.
These fractions are based on two quantities: infected persons who will never be diagnosed (nDx), and those we have observed in surveillance data (obsDx). The nDx fraction is constant – that is a result of the method we are using.
The unobserved fractions are falling because we are capturing more of the infections (that will eventually be diagnosed). That is most likely due to the increase in testing.
All of the estimates rely on our assumptions regarding the infection fatality rate (IFR), the case fatality rate (CFR) and the lag time from Dx to death. Since these are not known with certainty, we examined the sensitivity of our estimates to a range of values, where the range is taken from the 95% credible intervals from a recent analysis of data from China by Verity, et al.
These fatality rate estimates have been age-adjusted to the King County age distribution.
Infection count estimates are highly sensitive to the two fatality rate assumptions. For both, Lower fatality rates lead to higher estimates of the relevant infected population, other things equal. The IFR determines the Total infection estimates; the CFR determines the Dx and nDx infection estimates. The ratio of the two rates (IFR/CFR) influences the estimates of the Dx and nDx fractions: Higher ratios of these two rates lead to lower estimates of the never diagnosed fraction.
The age-specific rate estimates we use are based on data from Wuhan, China. Since we have age-adjusted the overall rates, the age composition of the population has already been accounted for.
Other proximate factors that would influence fatality rate estimates are hospital capacity, access to care and the efficacy of treatment. If these factors are different in Wuhan than in King County,
This would be expected to have greater impact on the CFR estimate, which applies only to diagnosed cases. Diagnosis is likely related to greater symptom severity. So these cases are more likely to progress to a stage requiring hospitalization, and access to effective care will influence their mortality risk.
It will probably have less influence on the IFR estimate, since this rate includes the large fraction of cases that are never diagnosed. The majority of these nDx infections are likely asymptomatic or too mild to qualify for access to testing. They have little risk of mortality so their inclusion lowers the overall fatality rate, and makes it less sensitive to access to care.
There could be substantial disparities in access to effective care within the local King County population. This is an important topic that should be explored further.
Our estimates were not very sensitive to the assumed lag time from Dx to death, but this is likely due to the narrow range of lag times examined (3 days). This range is taken from the 95% credible interval reported in Verity, et al.. The direction of the sensitivity is still clear, but with a caveat. Shorter lag times lead to lower estimates of the total infections at any point in time. The caveat is that the lagged estimated curves are simply time-shifted versions of each other.
Once the cumulative epidemic curve is flattened – so there are almost no new cases – the lag time will no longer have much influence on the estimates. And that is what we observe here.
The single most important data input for this method of backcalculation is the mortality count surveillance data. Those data determine the estimated total infections (given the sensitivity parameters we examine above).
While mortality counts are likely to be more accurate estimates of the true population Covid-19 fatalities than observed diagnoses are of the true number of infected persons, recent reports suggest that mortality counts may substantially underestimate the true toll of COVID-19. One known problem is that mortality counts routinely exclude cases that die at home, as these fatalities are rarely tested. New York City recently increased its mortality count by 40% to reflect presumptive Covid-19 fatalities at home.
If mortality counts are underestmated, the simple rule of thumb would be that each additional death would imply 132 additional infected persons, and a doubling of the mortality count would lead to a doubling of the total estimated infections.
The current cumulative mortality count in King County is 339, and our estimate of the total infection caseload is 44,803. If the true mortality count is twice as high, then the estimate of the total caseload would be 89,606.
Note that:
King County has a higher ratio of deaths per observed case counted in surveillance data than WA State as a whole (see above.
Variations in the level of ascertainment across counties could influence our estimates.
The cumulative total infected persons at any time \(t\) is the sum of the infected persons that will be diagnosed at some point (Dx) and those that will never be diagnosed (nDx). \[ \begin{aligned} Total_t & = Dx_{t} + nDx_{t} \end{aligned} \]
Note that the Dx infections at any time \(t\) is defined here as the infections that will ever be diagnosed, not just the cases that are already diagnosed and captured in the surveillance data; it is the sum of the infections that will be diagnosed in the future, and those that have already been diagnosed.
\[ \begin{aligned} Dx_t & = futureDx_{t} + obsDx_{t} \end{aligned} \] The cumulative total number of infected persons today are related to the cumulative death counts in a future period (\(M_{t+lag}\)) by the infection fatality rate (IFR) and the average lag time from infection to death \(lag\), using the simple backcalculation formula:
\[ \begin{aligned} Total_t * IFR & = M_{t+lag} \\ Total_t & = M_{t+lag}/IFR. \end{aligned} \] The current Dx infections are also related to the death counts in a future period, by the case fatality rate (CFR) and the average lag time from infection to death, by an analogous formula:
\[ \begin{aligned} Dx_t * CFR & = M_{t+lag} \\ Dx_t & = M_{t+lag}/CFR. \end{aligned} \]
The nDx infections are then easily obtained by subtraction.
\[ \begin{aligned} nDx_{t} & = Total_t - Dx_t \end{aligned} \]
A final implication of the definitions used for this method is that the fraction of cases that will and will never be diagnosed is a simple function of the two fatality rates. The intuition is that the IFR is the weighted sum of the CFR, and the fatality rate among those never diagnosed , where the weights are the fraction of infections that are diagnosed, and never diagnosed, respectively. We assume that all deaths due to infection are correctly ascertained, so the fatality rate for the nDx infections is 0.
\[ \begin{aligned} IFR & = CFR * DxFraction + 0 * nDxFraction \\ \frac{IFR}{CFR} & = DxFraction \end{aligned} \]
and the expected nDx fraction is \(1 - \frac{IFR}{CFR}\).
One implication of this is that these fractions are constant, they do not vary over time.
A common metric used in epidemiology is the “Undiagnosed Fraction”. But in this backcalculation context we need to make a distinction between the estimated fraction of infections that are never Diagnosed, and the fraction of estimated infections that will be diagnosed, but are are currently unobserved. The nDx infection fraction is constant, but the unobserved fraction varies over time, based on case ascertainment.
There are 3 different useful metrics for the unobserved fraction.
\[ \begin{aligned} unobsTotFraction_t & = \frac{Total_t - obsDx_t}{Total_t} \\ \end{aligned} \]
Alternatively, we can restrict the focus to eventually diagnosed infections that are currently unobserved: \[ \begin{aligned} unobsDxFraction_t & = \frac{Dx_t - obsDx_t}{Dx_t} \\ \end{aligned} \]
A final metric also restricts the focus to the infections we predict will be eventually diagnosed, and calculates the fraction of these that are observed (via surveillance) within the mortality lag period.
\[ \begin{aligned} obsDxbyLagFraction_t & = \frac{obsDx_t}{Dx_{t-lag}} \\ & = \frac{obsDx(t)}{{\frac{M(t)}{CFR}}} \\ & = CFR * \frac{obsDx(t)}{M(t)} \end{aligned} \]
This metric answers the question: "What fraction of the infections that we predicted would eventually be diagnosed (\(Dx_{t-lag}\)), are diagnosed by the estimated mortality lag time in the future (\(obsDx_{t}\)).
These are the 3 metrics presented in the Unobserved Infection section.
The primary predictions here are the Total number of infections at each point in time, broken down into those that will never be diagnosed (nDx) and those that will eventually be diagnosed. The estimation method relies on a single data time series as input: the cumulative death counts from COVID-19 recorded in surveillance data.
For the estimated unobserved fractions we rely also on the observed diagnosed caseload count series (obsDx). Note, these count are not used to infer the total caseload, only to estimate what we have “seen”, and what remains unobserved.
All of the other inputs – IFR, CFR and \(lag\) – are parameter estimates that we draw from the current best estimates in the literature, specifically Verity, et al., Lancet 3/30/2020, and modify for King County when possible.
Given the uncertainty in the parameter estimates, we will examine the sensitivity of our results to a range of values. The range is drawn from the 95% credible intervals in Verity, et al., values and derivation are shown in the next section.
Three parameter estimates are needed as inputs for this method:
COVID-19 fatality rates depend strongly on age, so we calculate an age-adjusted estimate for King County using the Verity, et al., age-specific estimates. For comparison, we also calculate the estimates for WA State as a whole, and all other counties in WA state, using the same method.
It is difficult to get an accurate estimate of the fatality rates in the midst of an epidemic. The true values can only be calculated at the end of the epidemic, once all of the cases and deaths have been ascertained, and even then, it is important to correct for under-ascertainment. For an infection like Covid-19, where fatality rates are highly sensitive to age, age-specific estimates allow for adjustment when used for populations with different age distributions.
The best current estimates of the age-specific CFRs and IFRs for COVID-19 are based on the data from Wuhan, China, where the epidemic was sizeable, brought (largely) under control, and the number of new cases now remains very low.
These estimates are shown below in rate per hundred (source: Verity, et al., Lancet 3/30/2020). They have been adjusted for demographic composition and under-ascertainment, but assume attack rates are homogeneous by age.
An overall age-adjusted fatality rate for any population can be estimated by taking a weighted average of these age-specific rates, where the weights are the population age distribution fractions.
# taken from Verity, et al. Lancet, 3/30/2020
# NOTE: these are modified from percent to rate scale for calcs.
DT::datatable(estFRs,
options = list(
columnDefs = list(
list(className = 'dt-left', targets = 1))),
caption = "Estimated Age-specific CFRs and IFRs (source: Verity, et al.) expressed as %") %>%
DT::formatPercentage(col = c("IFR", "CFR"), digits=3)
The population data used to estimate the age distribution comes from the WA State Office of Financial Management.
# Compare age distn's
DT::datatable(data.frame("Age.Range" = WA_age_distn$Age.Range,
"WA Total" = WA_age_distn$Age.Frac,
"King County" = KC_age_distn$Age.Frac,
"Outside KC" = OW_age_distn$Age.Frac),
caption = "Age distributions: WA State, King County, and Outside King County") %>%
DT::formatPercentage(c(2:4), digits = 1)
Three parameters can be estimated from this, the overall age-adjusted IFR and CFR, and the never diagnosed fraction. For comparison, the IFR and CFR estimates for WA, King County, and outside King County are shown in the table below (expressed as percents). The sensitivity ranges represent the min and max values based on the credible intervals from Verity, et al.
DT::datatable(frtable,
caption = "Age-specific IFR and CFR estimates with sensitivity ranges") %>%
DT::formatPercentage(c('WA.State', 'King.County', 'Outside.KC'), digits = 2)
Note that both the IFR and CFR are lower for King County than the rest of the state. This is a direct effect of the younger age distribution in King County. Since these rates are in the denominator of the infection count estimates (Total and Dx), this means the King County estimates, per observed Covid-19 fatality, will be proportionately higher than for WA State as a whole.
The IFR and CFR have the same age-fraction weights within each region, so their ratio will not vary much across the regions, which means the nDx estimates are quite similar.
Never Diagnosed Fraction (nDx, as %) = \((1 - \frac{IFR}{CFR})\%\) = 48.4%.
(see Methodology Overview section) for details on the derivation of this formula.
With jurisdiction-specific age distribution data, age-adjusted estimates can be obtained for any jurisdiction.
The distributions of IFR and CFR estimates for each county in Washington State are shown below.
# lots of tinkering to get this right :/
vlines <- data.frame(values = c(kcIFR, kcCFR), rateType = c("IFR","CFR"))
cols <- hue_pal()(2)
ggplot(data=melt(county_FRs[,-4], variable.name="rateType"),
aes(value, group=rateType,
fill=rateType, stat(count))) +
xlim(0, .05) +
geom_density(alpha=0.5) +
geom_vline(data=vlines,
aes(xintercept = values, color = rateType),
size = 1) +
scale_color_manual(values = rev(cols)) +
guides(fill = guide_legend(order = 1),
colour = guide_legend(order = 0, reverse=T))+
labs(title = "Age-adjusted fatality rate distributions (IFR and CFR) for WA Counties",
x = "Age-adjusted Fatality Rate",
y = "Count",
fill = "Rate (%)",
color = "KC Estimates")
Warning in melt(county_FRs[, -4], variable.name = "rateType"): The melt generic
in data.table has been passed a data.frame and will attempt to redirect to the
relevant reshape2 method; please note that reshape2 is deprecated, and this
redirection is now deprecated as well. To continue using melt methods from
reshape2 while both libraries are attached, e.g. melt.list, you can prepend
the namespace like reshape2::melt(county_FRs[, -4]). In the next version, this
warning will become an error.
Using County as id variables
ggplot(data=county_FRs,
aes(nDx)) +
geom_density(stat="density", fill = "lightblue3", alpha=0.5) +
geom_vline(data=data.frame(kcndxFrac),
aes(xintercept = kcndxFrac),
color = "lightblue4",
size = 1)+
xlim(.45, .50) +
annotate("text",
x=0.485, y=200,
label= "KC nDx",
hjust = 0, vjust = 0, size = 4,
color = "gray36") +
labs(title = "Age-adjusted never diagnosed fraction (nDx) distribution for WA Counties",
x = "Age-adjusted nDx Fraction",
y = "Density")
All valuesaexpressed as percents (rate per 100).
data.frame("County" = county_FRs[,"County"],
"IFR" = percent(county_FRs[,"IFR"], .2),
"CFR" = percent(county_FRs[,"CFR"], .2),
"nDx" = percent(county_FRs[,"nDx"], .1)) %>%
DT::datatable(options = list(
columnDefs = list(
list(className = 'dt-left', targets = 1),
list(className = 'dt-right', targets = 2:4)
)))
Verity, et al., provide different estimates of the lag time from infection to death, using alternative subsets of their data and adjustments for measurement error. We will use the posterior estimate based on the aggregated data from China: \(17.8\) days, rounded down to \(17\), and the range they report for that estimate, \(16 - 19\) days. That is a narrow range, but it is the best estimate currently available.
Note that one implication of the simple backcalculation is that, while each estimate of the Total (or Dx) time series curve depends on the lag time selected, other lag values will simply shift this curve (along the time axis) by the difference in the lags.