Merge pull request #52 from gutmann/qq_normal

Qq normal from gutmann
NCAR · Mar 9, 2017 · fd6b2d9 · fd6b2d9
2 parents a12802f + 94c0ccc
commit fd6b2d9
Show file tree

Hide file tree

Showing 12 changed files with 586 additions and 80 deletions.
diff --git a/.gitignore b/.gitignore
@@ -35,4 +35,5 @@ test_regression
 test_calendar
 test_qm
 test_config
+test_random
 gard
diff --git a/.travis.yml b/.travis.yml
@@ -6,7 +6,7 @@ os:
   - linux
 addons:
   apt:
-    source:
+    sources:
       - ubuntu-toolchain-r-test
     packages:
       - gfortran

diff --git a/src/config/configuration.f90 b/src/config/configuration.f90
@@ -97,10 +97,10 @@ subroutine read_base_options(options)
         end_date         = ""
         start_train      = ""
         end_train        = ""
-        start_transform  = ""
-        end_transform    = ""
-        post_start       = ""
-        post_end         = ""
+        start_transform  = "1980-01-01 00:00:00"
+        end_transform    = "1980-01-01 00:00:00"
+        start_post       = ""
+        end_post         = ""
 
         output_file      = "gard_out_"
         n_analogs        = -1
@@ -157,27 +157,24 @@ subroutine read_base_options(options)
 
         ! this is the time period to use when calculating e.g. quantile mapping transformations
         call options%transform_start%init("gregorian")
-        if (start_transform=="") then
-            stop "ERROR must set a transform start date"
+        if (start_transform==end_transform) then
+            write(*,*) "WARNING: If you are using any transforms start and end date should not be the same"
         endif
         call options%transform_start%set(start_transform)
         call options%transform_stop%init("gregorian")
-        if (end_transform=="") then
-            stop "ERROR must set a transform end date"
-        endif
         call options%transform_stop%set(end_transform)
         ! this is the time period to use when calculating e.g. quantile mapping transformations for post processing
-        if (maxval(post_correction_Xform) /= kNO_TRANSFORM) then
+        if (maxval(post_correction_transform) /= kNO_TRANSFORM) then
             call options%post_start%init("gregorian")
-            if (post_start=="") then
+            if (start_post=="") then
                 stop "ERROR must set a post-processing start date"
             endif
-            call options%post_start%set(post_start)
+            call options%post_start%set(start_post)
             call options%post_end%init("gregorian")
-            if (post_end=="") then
+            if (end_post=="") then
                 stop "ERROR must set a post-processing end date"
             endif
-            call options%post_end%set(post_end)
+            call options%post_end%set(end_post)
         endif
 
         options%training_file       = training_file

diff --git a/src/downscaling_stats/analogs.f90 b/src/downscaling_stats/analogs.f90
@@ -43,6 +43,10 @@ subroutine find_analogs(analogs, match, input, n, threshold, weights, skip_analo
         ! fast selection (O(n_inputs log(nanalogs)) worst case, typically closer to O(n_inputs))
         if (n>0) then
             if (.not.allocated(analogs)) allocate(analogs(n))
+            if (size(analogs) < n) then
+                deallocate(analogs)
+                allocate(analogs(n))
+            endif
             call top_n_analogs(distances, n, analogs)
         elseif (threshold>0) then
             distances = distances / nvars ! normalize by the number of variables so the supplied threshold doesn't have to changes

diff --git a/src/downscaling_stats/downscaling.f90 b/src/downscaling_stats/downscaling.f90
@@ -1,5 +1,6 @@
 module downscaling_mod
 
+    USE ieee_arithmetic
     use data_structures
     use string,             only : str
     use regression_mod,     only : compute_regression, compute_logistic_regression
@@ -9,14 +10,20 @@ module downscaling_mod
     use time_util,          only : setup_time_indices
     use basic_stats_mod,    only : stddev
     use io_routines,        only : file_exists, io_read
+    use random_mod,         only : box_muller_random
+    use sampling_mod,       only : sample_distribution
     implicit none
 
+    real,    parameter :: LOG_FILL_VALUE = 1e-30
+    real,    parameter :: MAX_ALLOWED_SIGMA = 20
+    integer, parameter :: N_RANDOM_SAMPLES = 10000
+
     integer*8, dimension(10) :: master_timers
-    real, parameter :: LOG_FILL_VALUE = 1e-30
-    real, parameter :: MAX_ALLOWED_SIGMA = 20
-    real, parameter :: N_RANDOM_SAMPLES = 10000
+
     real :: random_sample(N_RANDOM_SAMPLES)
 
+    logical :: post_process_errors = .False.
+
 contains
     subroutine downscale(training_atm, training_obs, predictors, output, options)
             implicit none
@@ -268,6 +275,21 @@ subroutine downscale(training_atm, training_obs, predictors, output, options)
                                                     output%variables(v)%logistic    (           :,           i, j),    &
                                                     current_threshold, options, timers, i,j)
 
+
+                            if (post_process_errors) then
+                                if (current_threshold /= kFILL_VALUE) then
+                                    call sample_distribution(output%variables(v)%data(:,i,j),                            &
+                                                             output%variables(v)%data(:,i,j),                            &
+                                                             output%variables(v)%errors(:,i, j),                         &
+                                                             exceedence_probability=output%variables(v)%logistic(:,i,j), &
+                                                             threshold = current_threshold)
+                                else
+                                    call sample_distribution(output%variables(v)%data(:,i,j),                            &
+                                                             output%variables(v)%data(:,i,j),                            &
+                                                             output%variables(v)%errors(:,i, j))
+                                endif
+                            endif
+
                             ! if the input data were transformed with e.g. a cube root or log transform, then reverse that transformation for the output
                             call System_Clock(timeone)
                             call transform_data(options%obs%input_Xforms(v), output%variables(v)%data(:,i,j), 1, noutput, &
@@ -560,46 +582,6 @@ subroutine transform_data(transform_type, pred_data,  &
         end select
     end subroutine transform_data
 
-    !>------------------------------------------------
-    !! Use the Box-Muller Transform to convert uniform to normal random deviates
-    !!
-    !! Note random_sample should be an allocated 1D real array
-    !! On return, random_sample will be filled with random normal (0,1) data
-    !!
-    !! Caveat: this transform is unable to generate extremely high values (>6.66)
-    !! Having switched to double precision it may do better now.
-    !!
-    !-------------------------------------------------
-    subroutine box_muller_random(random_sample)
-        implicit none
-        real, intent(inout) :: random_sample(:)
-        integer :: n,i
-
-        double precision :: u1, u2, s
-        double precision, allocatable :: double_random(:)
-
-        n = size(random_sample)
-        allocate(double_random(n))
-        call random_number(double_random)
-
-        do i=1,n,2
-            u1 = double_random(i)
-            if (i<n) then
-                u2 = double_random(i+1)
-            else
-                call random_number(u2)
-            endif
-
-            s = sqrt(-2 * log(u1))
-            random_sample(i) = s * cos(2 * kPI * u2)
-            if (i<n) then
-                random_sample(i+1) = s * sin(2 * kPI * u2)
-            endif
-
-        enddo
-
-    end subroutine box_muller_random
-
     function downscale_point(predictor, atm, obs_in, errors, output_coeff, logistic, logistic_threshold, options, timers, xpnt, ypnt) result(output)
         implicit none
         real,    dimension(:,:), intent(inout):: predictor, atm ! (ntimes, nvars)
@@ -645,7 +627,9 @@ function downscale_point(predictor, atm, obs_in, errors, output_coeff, logistic,
             return
         endif
 
-        allocate(obs, source=obs_in)
+        allocate(obs(size(obs_in)))
+        obs = obs_in
+
         if (options%time_smooth > 0) then
             do i=1, n
                 start = max(1, i - options%time_smooth)
@@ -893,12 +877,18 @@ subroutine downscale_analog_regression(x, atm, obs, output_coeff, output, error,
                     call System_Clock(timetwo)
                     timers(9) = timers(9) + (timetwo-timeone)
 
-                    ! compute the logistic as the probability of threshold exceedance in the analog population
                     call System_Clock(timeone)
-                    logistic = compute_analog_exceedance(obs, analogs(1:options%n_log_analogs), logistic_threshold)
+                    if (options%analog_weights) then
+                        ! compute the logistic as the weighted probability of threshold exceedance in the analog population
+                        logistic = compute_analog_exceedance(obs, analogs(1:options%n_log_analogs), logistic_threshold, weights)
+                    else
+                        ! compute the logistic as the probability of threshold exceedance in the analog population
+                        logistic = compute_analog_exceedance(obs, analogs(1:options%n_log_analogs), logistic_threshold)
+                    endif
                 else
 
                     if (options%analog_weights) then
+                        ! compute the logistic as the weighted probability of threshold exceedance in the analog population
                         logistic = compute_analog_exceedance(obs, analogs, logistic_threshold, weights)
                     else
                         ! compute the logistic as the probability of threshold exceedance in the analog population
@@ -908,8 +898,18 @@ subroutine downscale_analog_regression(x, atm, obs, output_coeff, output, error,
 
             else
                 logistic = compute_logistic_regression(x, regression_data, obs_analogs, coefficients, logistic_threshold)
-                where( coefficients >  1e20 ) coefficients =  1e20
-                where( coefficients < -1e20 ) coefficients = -1e20
+
+                ! Check for severe errors in the logistic regression.  This can happen with some input data.
+                ! If it fails, fall back to the analog exceedence calculation
+                if (ieee_is_nan(logistic)) then
+                    if (options%analog_weights) then
+                        logistic = compute_analog_exceedance(obs, analogs, logistic_threshold, weights)
+                    else
+                        ! compute the logistic as the probability of threshold exceedance in the analog population
+                        logistic = compute_analog_exceedance(obs, analogs, logistic_threshold)
+                    endif
+
+                endif
                 if (options%debug) then
                     do j = 1,nvars
                         output_coeff(j+nvars) = coefficients(j)
@@ -1141,6 +1141,7 @@ subroutine setup_timing(training_atm, training_obs, predictors, options)
         type(atm),    intent(inout) :: training_atm, predictors
         type(obs),    intent(inout) :: training_obs
         type(config), intent(in) :: options
+        integer :: n_atm_train, n_obs_train
 
         write(*,*) "-------------------"
         write(*,*) "Training ATM"
@@ -1152,6 +1153,13 @@ subroutine setup_timing(training_atm, training_obs, predictors, options)
         write(*,*) "Predictors"
         call setup_time_indices(predictors, options)
 
+        n_obs_train = training_obs%training_stop - training_obs%training_start
+        n_atm_train = training_atm%training_stop - training_atm%training_start
+
+        if (n_obs_train /= n_atm_train) then
+            stop "ERROR Inconsistent time periods in training atm and obs data"
+        endif
+
     end subroutine setup_timing
 
     subroutine allocate_data(output, n_obs_variables, noutput, nx, ny, n_atm_variables, tr_size, options)

diff --git a/src/downscaling_stats/regression.f90 b/src/downscaling_stats/regression.f90
@@ -33,10 +33,10 @@ function compute_regression(x, training_x, training_y, coefficients, y_test, err
 
         allocate(varlist(nvars))
         varlist = -1
-        varlist(1) = 1   ! this assumes there is a constant passed in for the first variable... 
-        useable_vars = 1 ! the constant 
-        
-        ! find all vars that have some variability through time. 
+        varlist(1) = 1   ! this assumes there is a constant passed in for the first variable...
+        useable_vars = 1 ! the constant
+
+        ! find all vars that have some variability through time.
         do i=2,nvars
             nonzero_count = 0
             do j=1,ntimes
@@ -45,7 +45,7 @@ function compute_regression(x, training_x, training_y, coefficients, y_test, err
                 endif
             enddo
             ! need at least two data points that are different from the first data point
-            ! to ensure that the regression on that variable might be useful. 
+            ! to ensure that the regression on that variable might be useful.
             if ( nonzero_count > 2 ) then
                 useable_vars = useable_vars + 1
                 varlist(i) = i
@@ -116,14 +116,17 @@ function compute_regression(x, training_x, training_y, coefficients, y_test, err
             if (present(weights)) then
                 if (allocated(weights)) then
                     if (size(weights)/=size(training_y_lp)) then
+                        !$omp critical (print_lock)
                         write(*,*) "ERROR size of weights /= data"
                         write(*,*), shape(weights), shape(training_y_lp)
+                        !$omp end critical (print_lock)
                     endif
                     if (present(y_test)) then
                         error = sqrt( sum(((training_y_lp - y_test)**2)*weights) / sum(weights) )
                     else
                         error = sqrt( sum(((training_y_lp - training_y)**2)*weights) / sum(weights) )
                     endif
+
                 else
                     if (present(y_test)) then
                         error = sqrt( sum((training_y_lp - y_test)**2) / ntimes )
@@ -171,6 +174,7 @@ subroutine lapack_least_squares(X, Y, B, working_space, info)
 
         nrhs = 1
 
+        ! passing working_space in and out *could* be more efficient.  Only used in tests right now
         if (present(working_space)) then
             ! First find the optimal work size (LWORK)
             LWORK = -1
@@ -189,13 +193,6 @@ subroutine lapack_least_squares(X, Y, B, working_space, info)
             CALL SGELS( 'N', M, N, NRHS, X, LDX, Y, LDY, WORK, LWORK, innerINFO )
         endif
         if (innerINFO/=0) then
-            ! !$omp critical (print_lock)
-            ! if (innerINFO<0) then
-            !     write(*,*) "ERROR in SGELS with argument:", 0-innerINFO
-            ! else
-            !     write(*,*) "ERROR in SGELS A does not have full rank, position:",innerINFO
-            ! endif
-            ! !$omp end critical (print_lock)
             Y(1) = sum(Y)/size(Y)
             Y(2:)= 0
         endif