-
Notifications
You must be signed in to change notification settings - Fork 1
/
garn-panda.Rmd
391 lines (315 loc) · 15.5 KB
/
garn-panda.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
---
title: Characterizing cemented sandstones with physics-based and machine learning
approaches
author: "Frank Male"
date: "`r format(Sys.Date(), '%d %B %Y')`"
output:
github_document:
pandoc_args: --webtex=http://chart.apis.google.com/chart?cht=tx&chl= # --mathjax
pdf_document: default
word_document: default
html_document: default
header-includes: \usepackage{amsmath}
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
require(XLConnect)
require(MASS)
require(tidyverse)
require(qqplotr)
require(ggpubr)
require(latex2exp)
require(janitor)
require(caret)
require(recipes)
require(iml)
library(reshape2)
library(ggridges)
require(dplyr)
require(doParallel)
figdir <- "figures/"
theme_set(theme_light())
df <- readWorksheetFromFile("C:/Users/malef/Dropbox/ROZ/data/GARN1990.xlsx", sheet=1) %>%
mutate_at(vars(CD:IMP), ~ as.numeric(.)) %>%
mutate(WELL = str_trim(WELL)) %>%
## Replace zeros in interparticle porosity and cement content to a value below the lowest measurable quantity
mutate(IMP = replace(IMP, IMP==0, 0.2),
ICL = replace(ICL, ICL==0, 0.1),
QCM = replace(QCM, QCM==0, 0.1)) %>%
select(WELL, KLH, POR, IMP, GS, SO, KAO, ICL, QCM, CAL, DOL) %>%
drop_na()
#str(df)
df_name <-readWorksheetFromFile("C:/Users/malef/Dropbox/ROZ/data/GARN1990.xlsx", sheet=2) %>%
rename(Var1=EXPLANATION.,explanation=Col2)
erfinv <- function(x) qnorm((x + 1)/2)/sqrt(2)
df <- df %>%
mutate(POR = POR/100,
IM_POR = IMP/100,
GS = GS*1e3,
mu = log(GS),
sigma = log(SO) / (sqrt(2) * erfinv(0.5)),
mean_GS = exp( mu + sigma/2),
Cv_GS = sqrt( exp( sigma^2) - 1),
gamma_GS = (exp(sigma^2) + 2) * Cv_GS,
tau_o = (0.9*IM_POR/(1-0.1*IM_POR))^(-0.378*2),
tau_u = tau_o * (1+Cv_GS),
a_o = 6 * mean_GS^-1,
a_u = 6* (sigma^2 + mean_GS^2)/(gamma_GS * sigma^2 + 3*mean_GS*sigma^2 + mean_GS^3),
CK_void_fraction = IM_POR^3/(1-IM_POR)^2#,
)
#now for the perm predictions
df <- df %>%
mutate(k_pl94_por = (mean_GS^2 * POR^3)/(72* tau_u * (1-POR)^2) * (gamma_GS * Cv_GS^3 + 3*Cv_GS^2 + 1)^2/(1 + Cv_GS^2)^2,
k_pl94_impor = (mean_GS^2 * IM_POR^3)/(72* tau_u * (1-IM_POR)^2) * (gamma_GS * Cv_GS^3 + 3*Cv_GS^2 + 1)^2/(1 + Cv_GS^2)^2
)
df <- df %>%
mutate(P_f = (KAO + QCM + CAL + DOL)/100,
P_b = ICL/100,
POR_u = IM_POR + P_f + P_b,
m = P_f * (1-IM_POR)/IM_POR,
m_b = P_b * (1-IM_POR)/IM_POR,
tau_e = tau_u * (1+Cv_GS) * (1+ 2*m_b/(1-m_b))^2 * (1 + 2*m/((1-m) * IM_POR^(1/3.0)))^2,
a_e_sorta = a_u * (1 - POR_u)/(1-IM_POR) #+ a_b*P_b + a_f*P_f
)
set.seed(42)
holdout.wells <- sample(unique(df$WELL), size=4)
df_holdout <- filter(df, WELL %in% holdout.wells)
df <- filter(df, !WELL %in% holdout.wells)
#head(arrange(df, QCM),20)
```
## Introduction
Sandstones one of the most common types of reservoir rocks. Let's see if we can explain their permeability.
## Narrative
The most well-known physics-based approach to estimating permeability was developed by Kozeny (1927) and later modified by Carman (1937). In its modern form, the equation is written as
$$
k = \frac{\phi^3}{2\tau(1-\phi)^2 a^2},
$$
which, for simplicity, we're going to recast as
$$
k = \frac{\phi_{CK}}{2\tau a^2},
$$
where permeability is $k$, porosity is $\phi$, tortuosity is $\tau$, the specific surface area is $a$, and the Carman-Kozeny void fraction is $\phi_{CK}$. For an uncemented sandstone, tortuosity can be calculated following the derivation in Appendix B, which comes from Panda and Lake (1994). For a cemented sandstone, the tortuosity changes because of cements blocking and forcing modification of the flow paths.
Specific surface area for an uncemented sandstone can be estimated from the particle size distribution, after assuming that the particles are spherical. After cementation, the nature of the cement is important in how the surface area changes. Some cements will coat the walls of the pores, slightly decreasing the specific surface area. Other cements will line or bridge the pores, moderately to greatly increasing the specific surface area.
A competing hypothesis is that pore throat sizes are the most important determinant of permeability-porosity transforms. This appears in the Winland relations that follow the form
$$
\log k = A \log \phi + B \log r + C,
$$
where $r$ is the pore throat radius. Pore throat radius might be more impacted by cements that coat the walls than cements that bridge the pores. Wouldn't that be interesting?
Now, because this is a data-driven approach, let's start by comparing permeability to the Carman-Kozeny void fraction.
```{r CK_void_fraction}
df %>%
ggplot(aes(x=(IMP/100)^3/(1-(IMP/100))^2, y=KLH)) +
geom_point() +
geom_smooth(method="lm", color="steelblue") +
scale_x_log10(TeX("$\\phi^3/(1-\\phi)^2$ using intergranular macroporosity")) +
scale_y_log10(TeX("Permeability (mD)"))
summary(lm(log(KLH) ~ log(CK_void_fraction), data=df))
```
Hey! That's pretty good! The R$^2$ is 0.85, and there are no odd trends. Sure, at low porosity the data resolution starts to be a problem, but that is at 1/100th of the average permeability, and "the permeability is bad" is really all you need to know there. Okay, with that positive result, let's add the grain size distribution to the model and see if we can do even better. With the grain size, we can start talking about the surface area of the pores. Bird et al. (1960) say that permeability is related to the square of the pore radius, which is roughly equivalent to the square of the grain diameter.
```{r CK_GS_porosity}
df %>%
ggplot(aes(x=mean_GS^2 *(IMP/100)^3/(1-(IMP/100))^2, y=KLH)) +
geom_point() +
geom_smooth(method="lm", color="steelblue") +
scale_x_log10(TeX("$D^2\\phi^3/(1-\\phi)^2$ using intergranular macroporosity (micron$^2$)")) +
scale_y_log10(TeX("$k$ (mD)"))
lm(log(KLH) ~ log(k_pred), data = mutate(df, k_pred = mean_GS^2 * CK_void_fraction)) %>%
summary()
```
Well, our Pearson correlation coefficent has gone down to 0.64. Nuts. Well, it's pretty hard to estimate the mean grain size from looking at Beard and Weyl's comparators, so I can understand that. Or... how well-sorted are these grains? Not that well-sorted? Then let's do a specific surface area that takes that into account, with Panda and Lake's derivation.
```{r CK_au_porosity}
df %>%
ggplot(aes(x=CK_void_fraction/a_u^2, y=KLH)) +
geom_point() +
geom_smooth(method="lm", color="steelblue") +
scale_x_log10(TeX("$\\phi_{CK}/a_u^2$ using intergranular macroporosity (micron$^2$)")) +
scale_y_log10(TeX("$k$ (mD)"))
lm(log(KLH) ~ log(k_pred), data = mutate(df, k_pred = CK_void_fraction/a_u^2)) %>%
summary()
```
Okay, so that's not helping the regression. It looks like Carman-Kozeny void fraction is our best main predictor.
Maybe adding the uncemented tortuosity will help. Maybe both tortuosity and the specific surface area are needed. Let's throw it all together, then make a Spearman correlation table as well, for good measure.
```{r CK_Av_tau}
df %>%
ggplot(aes(x = CK_void_fraction / (tau_o * a_u^2), y=KLH)) +
geom_point() +
geom_smooth(method="lm", color="steelblue") +
scale_x_log10(TeX("$\\phi_{CK} / (\\tau a_u^2)$ using intergranular macroporosity (micron$^2$)")) +
scale_y_log10(TeX("$k$ (mD)"))
lm(log(KLH) ~ log(k_pred), data = mutate(df, k_pred = CK_void_fraction/(tau_o * a_u^2))) %>%
summary()
```
Okay, well, the original (pre-compaction) tortuosity is helping things. Of course, it is a function of porosity, so really we're just building more complicated models for explaining porosity's effect on permeability. Also, this isn't really better than just using the Carman-Kozeny void fraction. With that in mind, let's look at tortuosity after taking the variable grain sizes into account.
```{r CK_Av_tau_u}
df %>%
ggplot(aes(x = CK_void_fraction / (tau_u * a_u^2), y=KLH)) +
geom_point() +
geom_smooth(method="lm", color="steelblue") +
scale_x_log10(TeX("$\\phi_{CK} / (\\tau a_u^2)$ using intergranular macroporosity (micron$^2$)")) +
scale_y_log10(TeX("$k$ (mD)"))
lm(log(KLH) ~ log(k_pred), data = mutate(df, k_pred = CK_void_fraction/(tau_u * a_u^2))) %>%
summary()
```
And that helps a bit, but it still isn't as good as just using $\phi_{CK}$. But wait, there's more! Cementation should matter. Let's try the cemented measure of tortuosity. That ought to get us somewhere.
```{r cemented}
df %>%
ggplot(aes(x=CK_void_fraction / (tau_e * a_u^2), y=KLH)) +
geom_point() +
geom_smooth(method="lm", color="steelblue") +
scale_x_log10(TeX("$\\phi_{CK} / (\\tau a_u^2)$ using intergranular macroporosity (micron$^2$)")) +
scale_y_log10(TeX("$k$ (mD)"))
lm(log(KLH) ~ log(k_pred), data = mutate(df, k_pred = CK_void_fraction/(tau_e * a_u^2))) %>%
summary()
```
Oops, switching to effective tortuosity hurts the fit. Okay then, let's add effective surface area. But how? The effective surface area has fitting parameters that we don't know a priori --- the effects of pore lining, bridging, and filling cement on the specific surface area. So, what do we do? Well, let's start by looking at cement volume versus permeability. Then, let's look at the Spearman correlation matrices between these cements and permeability.
```{r}
ggarrange(
df %>%
ggplot(aes(P_b, KLH)) +
geom_point() +
stat_smooth() +
scale_y_log10(breaks = c(1,10,100,1000,10000)) +
labs(x="Fraction pore bridging cement", y="Permeability (mD)")
,
df %>%
ggplot(aes(P_f, KLH)) +
geom_point() +
stat_smooth() +
scale_y_log10(breaks = c(1,10,100,1000,10000)) +
labs(x="Fraction pore filling cement", y="Permeability (mD)")
,
nrow = 1, ncol=2, labels="auto")
df %>%
select(CK_void_fraction, P_b, P_f, KLH) %>%
na.omit() %>%
cor(method="spearman")
```
Ah ha! This matters. Nice, high, Spearman r values showing that pore-filling and pore-bridging cement are bad for permeability. Now, these also happen to be strongly correlated with interparticle porosity and, as one might expect, so the story could be complicated, there. This looks like enough to start setting up a regression. What form should this regression take? Let's take some inspiration from Winland and slightly abuse Panda and Lake's (1995) Carman-Kozeny equation.
After that abuse, the regression equation becomes
$$
\log k = A_1 \log \phi_{CK} - A_2 \log P_b - A_3 \log P_f - A_4 \log a_u - A_5 \log \tau_e + A_0.
$$
Now, to the regressor!
```{r cements_regression}
model_cement <- lm( log(KLH) ~ log(CK_void_fraction) + log(P_b) + log(P_f) + log(a_u) + log(tau_e), data = df, na.action=na.exclude)
#summary(df[, c("CK_void_fraction","P_b","P_f","a_u")])
summary(model_cement)
postResample(predict(model_cement,df), log(df$KLH))
df %>%
mutate(k_pred = exp(predict(model_cement,.))) %>%
ggplot(aes(x = k_pred, y = KLH)) +
geom_point() +
geom_abline(slope=1,intercept=0) +
#geom_smooth(method="lm") +
scale_x_log10() +
scale_y_log10() +
labs(x="Predicted permeability from Winland-style model (mD)", y = "Measured permeability (mD)")
```
Now we're cooking with gas! An R$^2$ of 0.87 is nothing to sneeze at. Also, it's the first time we've improved beyond the straight Carman-Kozeny void fraction relation. The one issue is that this assumes a linear relationship between the cementation of various types and the porosity. The solution here is to go to non-parametric fitting. Now, non-parametric fitting is prone to overfitting, so we're going to have to set up some cross-validation. After that, let's perform some recursive feature elimination to figure out which features are really impacting permeability. Then, let's use a gradient boosting regressor on the significant features.
```{r}
fit_ctr_rfe <- rfeControl(functions = rfFuncs,
index = groupKFold(df$WELL)
)
rf_profile <- rfe(
log(KLH) ~ CK_void_fraction + P_b + P_f + a_u + tau_e,
data = df,
rfeControl = fit_ctr_rfe
)
print(paste("The predictors are:", paste(predictors(rf_profile), collapse = ", ")))
```
This is not a terribly surprising result. Now, to the gradient boosting regressor to see how it all comes together.
```{r}
cl <- makePSOCKcluster(12)
registerDoParallel(cl)
fit_control <- trainControl(index = groupKFold(df$WELL),
allowParallel = TRUE)
# xgb_grid <- expand.grid(nrounds = seq(20, 200, by=10),
# max_depth = 1, #c(1,2,3),
# eta = seq(.01,.16, by=.01),
# gamma = 0.55, #seq(0.4, 0.6, by = 0.05), #c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
# colsample_bytree = 0.8, #seq(0.4, 1.0, by=0.2),
# min_child_weight = 6, #c(5,6,7),
# subsample = 1#c(0.5,0.75,1.0)
# )
#
# fit_xgboost <- train(
# log(KLH) ~ CK_void_fraction + tau_e + P_b + P_f,
# data = df,
# method = 'xgbTree',
# trControl = fit_control,
# tuneGrid = xgb_grid
# )
#
# ggplot(fit_xgboost,plotType = "scatter", output="layered", highlight = TRUE)
# fit_xgboost$bestTune
final_grid <- expand.grid(
nrounds = 150,
max_depth = 1,
eta = 0.07,
gamma = 0.55,
colsample_bytree = 0.8,
min_child_weight = 6,
subsample = 1
)
fit_xgboost <- train(
log(KLH) ~ CK_void_fraction + tau_e + P_b + P_f,
data = df,
method = 'xgbTree',
trControl = fit_control,
tuneGrid = final_grid
)
df %>%
mutate(k_pred = exp(predict(fit_xgboost,.))) %>%
ggplot(aes(x = k_pred, y = KLH)) +
geom_point() +
geom_abline(slope=1,intercept=0) +
scale_x_log10() +
scale_y_log10() +
labs(x="Predicted permeability from gradient boosting (mD)", y = "Measured permeability (mD)")
postResample(log( predict(fit_xgboost, df)), log(df$KLH))
postResample(log( predict(fit_xgboost, df_holdout)), log(df_holdout$KLH))
```
And now, the variable importances:
```{r varImp}
library(xgboost)
predictor = Predictor$new(fit_xgboost, data = select(df, CK_void_fraction, tau_e, P_b, P_f), y = log(df$KLH))
imp = FeatureImp$new(predictor, loss = "rmse", n.repetitions = 40)
plot(imp) +
scale_x_continuous("Feature importance (RMSE without feature / RMSE)") +
geom_vline(xintercept = 1.0)
shaps <- xgb.plot.shap(as.matrix(select(df, CK_void_fraction, tau_e, P_b, P_f)),
model=fit_xgboost$finalModel,
top_n=4, n_col=2, pch="o")
data <- merge(melt(shaps$shap_contrib, value.name="SHAP"),
melt(shaps$data, value.name="Value")
)
ggplot(data,aes(x=SHAP,y=Var2, color=Value)) +
geom_jitter() +
#scale_color_distiller(palette="Reds") +
scale_color_viridis_c(option="plasma") +
labs(x="SHAP value", y="Feature")
ggarrange(
ggplot(filter(data, Var2=="CK_void_fraction"), aes(x=Value, y = SHAP)) +
geom_point() +
scale_x_log10() +
labs(x = TeX("$\\phi_{CK}"))
,
ggplot(filter(data, Var2=="tau_e"), aes(x=Value, y = SHAP)) +
geom_point() +
scale_x_log10() +
labs(x = TeX("$\\tau_e"))
,
ggplot(filter(data, Var2=="P_f"), aes(x=Value, y = SHAP)) +
geom_point() +
scale_x_log10() +
labs(x = TeX("$P_f"))
,
ggplot(filter(data, Var2=="P_b"), aes(x=Value, y = SHAP)) +
geom_point() +
scale_x_log10() +
labs(x = TeX("$P_b$"))
,
nrow = 2, ncol = 2)
```
Interesting results, yes?
See the paper for the exciting conclusion!