From 217f1ccb35d25d8381576de51e36414d677634bf Mon Sep 17 00:00:00 2001 From: Daniel Dresser Date: Fri, 8 Sep 2023 18:01:10 -0700 Subject: [PATCH] Resample : Optimize inseparable case with no scaling --- Changes.md | 5 ++ python/GafferImageTest/ResampleTest.py | 21 +++++ src/GafferImage/Resample.cpp | 115 +++++++++++++++++++++++-- 3 files changed, 133 insertions(+), 8 deletions(-) diff --git a/Changes.md b/Changes.md index 1706ffbae79..62a4e027123 100644 --- a/Changes.md +++ b/Changes.md @@ -6,6 +6,11 @@ Features - Viewer : Added "Snapshot To Catalogue" command to the right-click menu of the 3D view. +Improvements +------------ + +- ImageTransform, Resample : Improved performance for non-separable filters without scaling, with 2-6x speedups in some benchmark cases. + Fixes ----- diff --git a/python/GafferImageTest/ResampleTest.py b/python/GafferImageTest/ResampleTest.py index c8d9eedb5f7..1701602e97a 100644 --- a/python/GafferImageTest/ResampleTest.py +++ b/python/GafferImageTest/ResampleTest.py @@ -154,6 +154,27 @@ def __test( fileName, size, filter ) : with self.subTest( fileName = args[0], size = args[1], ftilter = args[2] ): __test( *args ) + def testInseparableFastPath( self ) : + + reader = GafferImage.ImageReader() + reader["fileName"].setValue( self.imagesPath() / "resamplePatterns.exr" ) + + # When applying an inseparable filter with no scaling, we can use a much faster code path. + # This code path should not have any effect on the result + resampleFastPath = GafferImage.Resample() + resampleFastPath["in"].setInput( reader["out"] ) + resampleFastPath['filterScale'].setValue( imath.V2f( 4 ) ) + resampleFastPath["filter"].setValue( "radial-lanczos3" ) + + # Force the slow code path using the "debug" parameter + resampleReference = GafferImage.Resample() + resampleReference["in"].setInput( reader["out"] ) + resampleReference['filterScale'].setValue( imath.V2f( 4 ) ) + resampleReference["filter"].setValue( "radial-lanczos3" ) + resampleReference["debug"].setValue( GafferImage.Resample.Debug.SinglePass ) + + self.assertImagesEqual( resampleFastPath["out"], resampleReference["out"] ) + def testSincUpsize( self ) : c = GafferImage.Constant() diff --git a/src/GafferImage/Resample.cpp b/src/GafferImage/Resample.cpp index 567602a66c2..ace0e3acddc 100644 --- a/src/GafferImage/Resample.cpp +++ b/src/GafferImage/Resample.cpp @@ -64,10 +64,14 @@ enum Passes { Horizontal = 1, Vertical = 2, - Both = Horizontal | Vertical + Both = Horizontal | Vertical, + + // Special pass label when we must compute both passes in one, but there is no scaling. + // This allows a special code path which is up to 6X faster. + BothOptimized = Both | 4 }; -unsigned requiredPasses( const Resample *resample, const ImagePlug *image, const OIIO::Filter2D *filter ) +unsigned requiredPasses( const Resample *resample, const ImagePlug *image, const OIIO::Filter2D *filter, V2f &ratio ) { int debug = resample->debugPlug()->getValue(); if( debug == Resample::HorizontalPass ) @@ -76,12 +80,24 @@ unsigned requiredPasses( const Resample *resample, const ImagePlug *image, const } else if( debug == Resample::SinglePass ) { + // For a SinglePass debug mode, we always use Both. + // Note that we don't use the optimized pass here, even if the ratio is 1 - we want debug to always + // use the same path. return Horizontal | Vertical; } if( image == image->parent()->outPlug() ) { - return filter->separable() ? Vertical : Both; + if( filter->separable() ) + { + return Vertical; + } + else + { + // The filter isn't separable, so we must process everything at once. If the ratio has no + // scaling though, we can use the optimized path. + return ( ratio == V2f( 1.0 ) ) ? BothOptimized : Both; + } } return Horizontal; } @@ -193,7 +209,7 @@ const OIIO::Filter2D *filterAndScale( const std::string &name, V2f ratio, V2f &i /// only computed once and then reused. At the time of writing, profiles indicate that /// accessing pixels via the Sampler is the main bottleneck, but once that is optimised /// perhaps cached filter weights could have a benefit. -void filterWeights( const OIIO::Filter2D *filter, const float inputFilterScale, const float filterRadius, const int x, const float ratio, const float offset, Passes pass, std::vector &supportRanges, std::vector &weights ) +void filterWeights1D( const OIIO::Filter2D *filter, const float inputFilterScale, const float filterRadius, const int x, const float ratio, const float offset, Passes pass, std::vector &supportRanges, std::vector &weights ) { weights.reserve( ( 2 * ceilf( filterRadius ) + 1 ) * ImagePlug::tileSize() ); supportRanges.reserve( 2 * ImagePlug::tileSize() ); @@ -221,6 +237,42 @@ void filterWeights( const OIIO::Filter2D *filter, const float inputFilterScale, } } +// For the inseparable case, we can't always reuse the weights for an adjacent row or column. +// There are a lot of possible scaling factors where the ratio can be represented as a fraction, +// and the weights needed would repeat after a certain number of pixels, and we could compute weights +// for a limited section of pixels, and reuse them in a tiling way. +// That's a bit complicated though, so we're just handling the simplest case currently ( since it is +// a common case ): +// if there is no scaling, then we only need to compute the weights for one pixel, and we can reuse them +// for all pixels. This means we don't loop over output pixels at all here - we just compute the weights +// for one output pixel, and return one 2D support for this pixel - it just gets shifted for each adjacent +// pixel. +void filterWeights2D( const OIIO::Filter2D *filter, const V2f inputFilterScale, const V2f filterRadius, const V2i p, const V2f offset, Box2i &support, std::vector &weights ) +{ + weights.reserve( ( 2 * ceilf( filterRadius.x ) + 1 ) * ( 2 * ceilf( filterRadius.y ) + 1 ) ); + + const V2f filterCoordinateMult( 1.0f / inputFilterScale.x, 1.0f / inputFilterScale.y ); + + // input pixel position (floating point) + V2f i = V2f( p ) + V2f( 0.5 ) + offset; + + support = Box2i( + V2i( ceilf( i.x - 0.5f - filterRadius.x ), ceilf( i.y - 0.5f - filterRadius.y ) ), + V2i( floorf( i.x + 0.5f + filterRadius.x ), floorf( i.y + 0.5f + filterRadius.y ) ) + ); + + for( int fY = support.min.y; fY < support.max.y; ++fY ) + { + const float fy = filterCoordinateMult.y * ( float( fY ) + 0.5 - i.y ); + for( int fX = support.min.x; fX < support.max.x; ++fX ) + { + const float fx = filterCoordinateMult.x * ( float( fX ) + 0.5f - i.x ); + const float w = (*filter)( fx, fy ); + weights.push_back( w ); + } + } +} + Box2f transform( const Box2f &b, const M33f &m ) { if( b.isEmpty() ) @@ -477,7 +529,7 @@ void Resample::hashChannelData( const GafferImage::ImagePlug *parent, const Gaff filterPlug()->hash( h ); - const unsigned passes = requiredPasses( this, parent, filter ); + const unsigned passes = requiredPasses( this, parent, filter, ratio ); if( passes & Horizontal ) { h.append( inputFilterScale.x ); @@ -491,6 +543,12 @@ void Resample::hashChannelData( const GafferImage::ImagePlug *parent, const Gaff h.append( offset.y ); } + if( passes == BothOptimized ) + { + // Append an extra flag so our hash reflects that we are going to take the optimized path + h.append( true ); + } + const V2i tileOrigin = context->get( ImagePlug::tileOriginContextName ); Sampler sampler( passes == Vertical ? horizontalPassPlug() : inPlug(), @@ -518,7 +576,7 @@ IECore::ConstFloatVectorDataPtr Resample::computeChannelData( const std::string const OIIO::Filter2D *filter = filterAndScale( filterPlug()->getValue(), ratio, inputFilterScale ); inputFilterScale *= filterScalePlug()->getValue(); - const unsigned passes = requiredPasses( this, parent, filter ); + const unsigned passes = requiredPasses( this, parent, filter, ratio ); Sampler sampler( passes == Vertical ? horizontalPassPlug() : inPlug(), @@ -588,6 +646,47 @@ IECore::ConstFloatVectorDataPtr Resample::computeChannelData( const std::string } } } + else if( passes == BothOptimized ) + { + Box2i support; + std::vector weights; + filterWeights2D( filter, inputFilterScale, filterRadius, tileBound.min, offset, support, weights ); + + V2i oP; // output pixel position + V2i supportOffset; + for( oP.y = tileBound.min.y; oP.y < tileBound.max.y; ++oP.y ) + { + supportOffset.y = oP.y - tileBound.min.y; + + for( oP.x = tileBound.min.x; oP.x < tileBound.max.x; ++oP.x ) + { + Canceller::check( context->canceller() ); + + supportOffset.x = oP.x - tileBound.min.x; + std::vector::const_iterator wIt = weights.begin(); + + float v = 0.0f; + float totalW = 0.0f; + sampler.visitPixels( + Imath::Box2i( support.min + supportOffset, support.max + supportOffset ), + [&wIt, &v, &totalW]( float cur, int x, int y ) + { + const float w = *wIt++; + v += w * cur; + totalW += w; + } + ); + + if( totalW != 0.0f ) + { + *pIt = v / totalW; + } + + ++pIt; + } + } + + } else if( passes == Horizontal ) { // When the filter is separable we can perform filtering in two @@ -600,7 +699,7 @@ IECore::ConstFloatVectorDataPtr Resample::computeChannelData( const std::string // we precompute the weights now to avoid repeating work later. std::vector supportRanges; std::vector weights; - filterWeights( filter, inputFilterScale.x, filterRadius.x, tileBound.min.x, ratio.x, offset.x, Horizontal, supportRanges, weights ); + filterWeights1D( filter, inputFilterScale.x, filterRadius.x, tileBound.min.x, ratio.x, offset.x, Horizontal, supportRanges, weights ); V2i oP; // output pixel position @@ -646,7 +745,7 @@ IECore::ConstFloatVectorDataPtr Resample::computeChannelData( const std::string // we precompute the weights now to avoid repeating work later. std::vector supportRanges; std::vector weights; - filterWeights( filter, inputFilterScale.y, filterRadius.y, tileBound.min.y, ratio.y, offset.y, Vertical, supportRanges, weights ); + filterWeights1D( filter, inputFilterScale.y, filterRadius.y, tileBound.min.y, ratio.y, offset.y, Vertical, supportRanges, weights ); std::vector::const_iterator supportIt = supportRanges.begin(); std::vector::const_iterator rowWeightsIt = weights.begin();