diff --git a/drivers/include/drivers/SPI.h b/drivers/include/drivers/SPI.h index 69df632d333..58e79f9f16d 100644 --- a/drivers/include/drivers/SPI.h +++ b/drivers/include/drivers/SPI.h @@ -26,6 +26,7 @@ #include "drivers/DigitalOut.h" #include "platform/SingletonPtr.h" #include "platform/NonCopyable.h" +#include "platform/CacheAlignedBuffer.h" #if defined MBED_CONF_DRIVERS_SPI_COUNT_MAX && DEVICE_SPI_COUNT > MBED_CONF_DRIVERS_SPI_COUNT_MAX #define SPI_PERIPHERALS_USED MBED_CONF_DRIVERS_SPI_COUNT_MAX @@ -194,6 +195,11 @@ const use_gpio_ssel_t use_gpio_ssel; * the transfer but others can execute). Here's a sample of how to send the same data as above * using the blocking async API:

* + *

Note that when using the asynchronous API, you must use the CacheAlignedBuffer class when declaring the + * receive buffer. This is because some processors' async SPI implementations require the received buffer to + * be at an address which is aligned to the processor cache line size. CacheAlignedBuffer takes care of this + * for you and provides functions (data(), begin(), end()) to access the underlying data in the buffer.

+ * * @code * #include "mbed.h" * @@ -203,8 +209,8 @@ const use_gpio_ssel_t use_gpio_ssel; * device.format(8, 0); * * uint8_t command[2] = {0x0A, 0x0B}; - * uint8_t response[2]; - * int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(response)); + * CacheAlignedBuffer response; + * int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(command)); * } * @endcode * @@ -458,8 +464,9 @@ class SPI : private NonCopyable { * @param tx_buffer The TX buffer with data to be transferred. If NULL is passed, * the default %SPI value is sent. * @param tx_length The length of TX buffer in bytes. - * @param rx_buffer The RX buffer which is used for received data. If NULL is passed, - * received data are ignored. + * @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer + * structure must be passed so that cache alignment can be handled for data received from DMA. + * May be nullptr if rx_length is 0. * @param rx_length The length of RX buffer in bytes. * @param callback The event callback function. * @param event The logical OR of events to subscribe to. 
May be #SPI_EVENT_ALL, or some combination @@ -471,16 +478,18 @@ class SPI : private NonCopyable { */ template typename std::enable_if::value, int>::type - transfer(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) + transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) { - return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event); + MBED_ASSERT(rx_length <= static_cast(rx_buffer.capacity())); + return transfer_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, callback, event); } // Overloads of the above to support passing nullptr template typename std::enable_if::value, int>::type - transfer(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) + transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) { + MBED_ASSERT(rx_length <= static_cast(rx_buffer.capacity())); return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event); } template @@ -502,8 +511,10 @@ class SPI : private NonCopyable { * Internally, the chip vendor may implement this function using either DMA or interrupts. * * @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0. - * @param tx_length The length of TX buffer in bytes. If 0, no transmission is done. - * @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0. + * @param tx_length The length of TX buffer in bytes. If 0, the default %SPI data value is sent when receiving data. + * @param rx_buffer The RX buffer which is used for received data. 
Rather than a C array, a CacheAlignedBuffer + * structure must be passed so that cache alignment can be handled for data received from DMA. + * May be nullptr if rx_length is 0. * @param rx_length The length of RX buffer in bytes If 0, no reception is done. * @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default). * @@ -515,17 +526,19 @@ class SPI : private NonCopyable { */ template typename std::enable_if::value, int>::type - transfer_and_wait(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) + transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) { - return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout); + MBED_ASSERT(rx_length <= static_cast(rx_buffer.capacity())); + return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout); } // Overloads of the above to support passing nullptr template typename std::enable_if::value, int>::type - transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) + transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) { - return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout); + MBED_ASSERT(rx_length <= static_cast(rx_buffer.capacity())); + return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout); } template typename std::enable_if::value, int>::type @@ -574,7 +587,8 @@ class SPI : private NonCopyable { * the default SPI value is sent * @param tx_length The length of TX buffer 
in bytes. * @param rx_buffer The RX buffer which is used for received data. If NULL is passed, - * received data are ignored. + * received data are ignored. This buffer is guaranteed to be cache aligned + * if the MCU has a cache. * @param rx_length The length of RX buffer in bytes. * @param callback The event callback function. * @param event The event mask of events to modify. @@ -599,6 +613,7 @@ class SPI : private NonCopyable { * @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0. * @param tx_length The length of TX buffer in bytes. If 0, no transmission is done. * @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0. + * This buffer is guaranteed to be cache aligned if the MCU has a cache. * @param rx_length The length of RX buffer in bytes If 0, no reception is done. * @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default). * @@ -722,6 +737,18 @@ class SPI : private NonCopyable { * iff start_transfer() has been called and the chip has been selected but irq_handler_asynch() * has NOT been called yet. */ volatile bool _transfer_in_progress = false; + + // If there is a transfer in progress, this indicates whether it used DMA and therefore requires a cache + // flush at the end + bool _transfer_in_progress_uses_dma; + +#if __DCACHE_PRESENT + // These variables store the location and length in bytes of the Rx buffer if an async transfer + // is in progress. They are used for invalidating the cache after the transfer completes. 
+ void *_transfer_in_progress_rx_buffer; + size_t _transfer_in_progress_rx_len; +#endif + /* Event flags used for transfer_and_wait() */ rtos::EventFlags _transfer_and_wait_flags; #endif // DEVICE_SPI_ASYNCH diff --git a/drivers/source/SPI.cpp b/drivers/source/SPI.cpp index 376ec7c708b..a0339487215 100644 --- a/drivers/source/SPI.cpp +++ b/drivers/source/SPI.cpp @@ -396,6 +396,16 @@ void SPI::abort_transfer() _set_ssel(1); } +#if __DCACHE_PRESENT + if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) { + // If the cache is present, invalidate the Rx data so it's loaded from main RAM. + // We only want to do this if DMA actually got used for the transfer because, if interrupts + // were used instead, the cache might have the correct data and NOT the main memory. + SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len); + } +#endif + _transfer_in_progress = false; + #if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN dequeue_transaction(); #endif @@ -466,8 +476,31 @@ void SPI::start_transfer(const void *tx_buffer, int tx_length, void *rx_buffer, _callback = callback; _irq.callback(&SPI::irq_handler_asynch); + +#if __DCACHE_PRESENT + // On devices with a cache, we need to carefully manage the Tx and Rx buffer cache invalidation. + // We can assume that asynchronous SPI implementations might rely on DMA, and that DMA will + // not interoperate with the CPU cache. So, manual flushing/invalidation will be required. + // This page is very useful for how to do this correctly: + // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 + if (tx_length > 0) { + // For chips with a cache, we need to evict the Tx data from cache to main memory. + // This ensures that the DMA controller can see the most up-to-date copy of the data. 
+ SCB_CleanDCache_by_Addr(const_cast(tx_buffer), tx_length); + } + + // Additionally, we have to make sure that there aren't any pending changes which could be written back + // to the Rx buffer memory by the cache at a later date, corrupting the DMA results. + if (rx_length > 0) { + SCB_InvalidateDCache_by_Addr(rx_buffer, rx_length); + } + _transfer_in_progress_rx_buffer = rx_buffer; + _transfer_in_progress_rx_len = rx_length; +#endif + _transfer_in_progress = true; - spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage); + + _transfer_in_progress_uses_dma = spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage); } void SPI::lock_deep_sleep() @@ -508,7 +541,17 @@ void SPI::dequeue_transaction() void SPI::irq_handler_asynch(void) { int event = spi_irq_handler_asynch(&_peripheral->spi); - if (_callback && (event & SPI_EVENT_ALL)) { + if ((event & SPI_EVENT_ALL)) { + +#if __DCACHE_PRESENT + if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) { + // If the cache is present, invalidate the Rx data so it's loaded from main RAM. + // We only want to do this if DMA actually got used for the transfer because, if interrupts + // were used instead, the cache might have the correct data and NOT the main memory. + SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len); + } +#endif + // If using GPIO CS mode, unless we were asked to keep the peripheral selected, deselect it. // If there's another transfer queued, we *do* want to deselect the peripheral now. // It will be reselected in start_transfer() which is called by dequeue_transaction() below. 
@@ -518,7 +561,10 @@ void SPI::irq_handler_asynch(void) _transfer_in_progress = false; unlock_deep_sleep(); - _callback.call(event & SPI_EVENT_ALL); + + if (_callback) { + _callback.call(event & SPI_EVENT_ALL); + } } #if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN if (event & (SPI_EVENT_ALL | SPI_EVENT_INTERNAL_TRANSFER_COMPLETE)) { diff --git a/hal/include/hal/spi_api.h b/hal/include/hal/spi_api.h index 65853c69b3b..011073a02a7 100644 --- a/hal/include/hal/spi_api.h +++ b/hal/include/hal/spi_api.h @@ -25,6 +25,8 @@ #include "hal/dma_api.h" #include "hal/buffer.h" +#include + #if DEVICE_SPI /** @@ -393,7 +395,11 @@ const PinMap *spi_slave_cs_pinmap(void); * @{ */ -/** Begin the SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff +/** Begin the asynchronous SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff. + * + * If the device has a data cache, the Tx data is guaranteed to have been flushed from the cache + * to main memory already. Additionally, the Rx buffer is guaranteed to be cache aligned, and will + * be invalidated by the SPI layer after the transfer is complete. * * @param[in] obj The SPI object that holds the transfer information * @param[in] tx The transmit buffer @@ -404,8 +410,16 @@ const PinMap *spi_slave_cs_pinmap(void); * @param[in] event The logical OR of events to be registered * @param[in] handler SPI interrupt handler * @param[in] hint A suggestion for how to use DMA with this transfer + * + * @return True if DMA was actually used for the transfer, false otherwise (if interrupts or another CPU-based + * method is used to do the transfer). + * + * @note On MCUs with a data cache, the return value is used to determine if a cache invalidation needs to be done + * after the transfer is complete. If this function returns true, the driver layer will cache invalidate the Rx buffer under + * the assumption that the data needs to be re-read from main memory. 
Be careful, because if the read was not actually + * done by DMA, and the rx data is in the CPU cache, this invalidation will corrupt it. */ -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint); +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint); /** The asynchronous IRQ handler * diff --git a/platform/include/platform/CacheAlignedBuffer.h b/platform/include/platform/CacheAlignedBuffer.h new file mode 100644 index 00000000000..50b418b7de8 --- /dev/null +++ b/platform/include/platform/CacheAlignedBuffer.h @@ -0,0 +1,340 @@ +/* mbed Microcontroller Library + * Copyright (c) 2006-2023 ARM Limited + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MBED_CACHEALIGNEDBUFFER_H +#define MBED_CACHEALIGNEDBUFFER_H + +#include + +#include "device.h" + +namespace mbed { + +namespace detail::cab { +/** + * @brief Calculate the needed capacity for a cache aligned buffer's backing buffer based on the + * needed capacity and element size. 
+ * + * @param neededCapacity Capacity needed for the buffer + * @param elementSize Size of each element + * + * @return Needed backing buffer size + */ +constexpr inline size_t getNeededBackingBufferSize(size_t neededCapacity, size_t elementSize) +{ +#if __DCACHE_PRESENT + // Allocate enough extra space that we can shift the start of the buffer towards higher addresses to be on a cache line. + // The worst case for this is when the first byte is allocated 1 byte past the start of a cache line, so we + // will need an additional (cache line size - 1) bytes. + // Additionally, since we are going to be invalidating this buffer, we can't allow any other variables to be + // in the same cache lines, or they might get corrupted. + // So, we need to round up the backing buffer size to the nearest multiple of the cache line size. + // The math for rounding up can be found here: + // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 + size_t requiredSizeRoundedUp = (neededCapacity * elementSize + __SCB_DCACHE_LINE_SIZE - 1) & ~((__SCB_DCACHE_LINE_SIZE) - 1); + return requiredSizeRoundedUp + __SCB_DCACHE_LINE_SIZE - 1; +#else + // No cache on this platform so don't need any extra space. + return neededCapacity * elementSize; +#endif +} +} + +/** \addtogroup platform-public-api */ +/** @{*/ +/** + * \defgroup platform_CacheAlignedBuffer CacheAlignedBuffer class + * @{ + */ + +/** + * @brief CacheAlignedBuffer is used by Mbed in locations where we need a cache-aligned buffer. + * + *

Cache alignment is desirable in several different situations in embedded programming -- one common + * use is when working with DMA or other peripherals which write their results back to main memory. + * After these peripherals do their work, the data will be correct in main memory, but the CPU cache + * might also contain a value cached from that memory which is now incorrect.

+ * + *

In order to read those results from memory without risk of getting old data from the + * CPU cache, one needs to align the buffer so it takes up an integer number of cache lines, + * then invalidate the cache lines so that the data gets reread from RAM.

+ * + *

%CacheAlignedBuffer provides an easy way to allocate the correct amount of space so that + * a buffer of any size can be made cache-aligned. To instantiate a %CacheAlignedBuffer, create one of its + * subtypes, StaticCacheAlignedBuffer or DynamicCacheAlignedBuffer.

+ * + *

Converting Code to use CacheAlignedBuffer

+ * For code using static arrays, like this: + * @code + * uint8_t rxBuffer[16]; + * spi.transfer_and_wait(nullptr, 0, rxBuffer, 16); + * @endcode + * Use a StaticCacheAlignedBuffer: + * @code + * StaticCacheAlignedBuffer rxBuffer; + * spi.transfer_and_wait(nullptr, 0, rxBuffer, 16); + * @endcode + * For code using dynamic allocation to handle unknown buffer sizes, like: + * @code + * uint16_t * rxBuffer = new uint16_t[bufferSize]; + * spi.transfer_and_wait(nullptr, 0, rxBuffer, bufferSize); + * ... + * delete[] rxBuffer; + * @endcode + * use a DynamicCacheAlignedBuffer: + * @code + * DynamicCacheAlignedBuffer rxBuffer(bufferSize); + * spi.transfer_and_wait(nullptr, 0, rxBuffer, bufferSize); + * @endcode + * + * @tparam DataT Type of the data to store in the buffer. Note: %CacheAlignedBuffer is not designed for + * using class types as DataT, and will not call constructors. + */ +template +class CacheAlignedBuffer { + +protected: + /// Pointer to the aligned buffer. Must be set in each constructor of each subclass. + DataT *_alignedBufferPtr; + + /// Capacity of the aligned buffer, in terms of number of DataT elements + size_t _alignedBufferCapacity; + + // Protected constructor to block instantiation + CacheAlignedBuffer() = default; + + /** + * Find and return the first location in the given buffer that starts on a cache line. + * If this MCU does not use a cache, this function is a no-op. + * + * @param buffer Pointer to buffer + * + * @return Pointer to first data item, aligned at the start of a cache line. + */ + static inline DataT *findCacheLineStart(uint8_t *buffer) + { +#if __DCACHE_PRESENT + // Use integer division to divide the address down to the cache line size, which + // rounds to the cache line before the given address. + // So that we always go one cache line back even if the given address is on a cache line, + // subtract 1. 
+ ptrdiff_t prevCacheLine = ((ptrdiff_t)(buffer - 1)) / __SCB_DCACHE_LINE_SIZE; + + // Now we just have to multiply up again to get an address (adding 1 to go forward by 1 cache line) + return reinterpret_cast((prevCacheLine + 1) * __SCB_DCACHE_LINE_SIZE); +#else + return reinterpret_cast(buffer); +#endif + } + +public: + + // Iterator types + typedef DataT *iterator; + typedef DataT const *const_iterator; + + /** + * @brief Get a pointer to the aligned data array inside the buffer + */ + DataT *data() + { + return _alignedBufferPtr; + } + + /** + * @brief Get a pointer to the aligned data array inside the buffer (const version) + */ + DataT const *data() const + { + return _alignedBufferPtr; + } + + /** + * @brief Element access + */ + DataT &operator[](size_t index) + { + return _alignedBufferPtr[index]; + } + + /** + * @brief Element access (const) + */ + DataT operator[](size_t index) const + { + return _alignedBufferPtr[index]; + } + + /** + * @brief Get iterator for start of buffer + */ + iterator begin() + { + return _alignedBufferPtr; + } + + /** + * @brief Get iterator for start of buffer + */ + const_iterator begin() const + { + return _alignedBufferPtr; + } + + /** + * @brief Get iterator for end of buffer + */ + iterator end() + { + return _alignedBufferPtr + _alignedBufferCapacity; + } + + /** + * @brief Get iterator for end of buffer + */ + const_iterator end() const + { + return _alignedBufferPtr + _alignedBufferCapacity; + } + + /** + * @return The maximum amount of DataT elements that this buffer can hold. + */ + constexpr size_t capacity() + { + return _alignedBufferCapacity; + } +}; + +/** + * @brief CacheAlignedBuffer type designed for static allocation. + * + * Use a StaticCacheAlignedBuffer when you want to create a cache-aligned buffer with a fixed size + * at compile time. %StaticCacheAlignedBuffers can be declared globally, as local variables, or using + * new[] and delete[]. + * + * @tparam DataT Type of the data to store in the buffer. 
Note: %CacheAlignedBuffer is not designed for + * using class types as DataT, and will not call constructors. + * @tparam BufferSize Buffer size (number of elements) needed by the application for the buffer. + */ +template +class StaticCacheAlignedBuffer : public CacheAlignedBuffer { +private: + + uint8_t _backingBuffer[detail::cab::getNeededBackingBufferSize(BufferSize, sizeof(DataT))]; + +public: + + /** + * @brief Construct new cache-aligned buffer. Buffer will be zero-initialized. + */ + StaticCacheAlignedBuffer(): + _backingBuffer{} + { + this->_alignedBufferPtr = this->findCacheLineStart(_backingBuffer); + this->_alignedBufferCapacity = BufferSize; + } + + /** + * @brief Copy from other cache-aligned buffer. Buffer memory will be copied. + */ + StaticCacheAlignedBuffer(StaticCacheAlignedBuffer const &other) + { + this->_alignedBufferPtr = this->findCacheLineStart(_backingBuffer); + memcpy(this->_alignedBufferPtr, other._alignedBufferPtr, BufferSize * sizeof(DataT)); + this->_alignedBufferCapacity = BufferSize; + } + + /** + * @brief Assign from other cache-aligned buffer. Buffer memory will be assigned. + * + * Only a buffer with the same data type and size can be assigned. + */ + StaticCacheAlignedBuffer &operator=(StaticCacheAlignedBuffer const &other) + { + memmove(this->_alignedBufferPtr, other._alignedBufferPtr, BufferSize * sizeof(DataT)); + } +}; + +/** + * @brief CacheAlignedBuffer type which allocates its backing buffer on the heap. + * + * Use a DynamicCacheAlignedBuffer when you want to create a cache-aligned buffer with a size + * known only at runtime. When constructed, %DynamicCacheAlignedBuffers allocate backing memory off the + * heap for the provided number of elements. The memory will be released when the buffer object is destroyed. + * + * @tparam DataT Type of the data to store in the buffer. Note: %CacheAlignedBuffer is not designed for + * using class types as DataT, and will not call constructors. 
+ */ +template +class DynamicCacheAlignedBuffer : public CacheAlignedBuffer { + uint8_t *_heapMem; +public: + /** + * @brief Construct new cache-aligned buffer. Buffer will be zero-initialized and allocated from the heap. + * + * @param capacity Number of elements the buffer shall hold + */ + explicit DynamicCacheAlignedBuffer(size_t capacity): + _heapMem(new uint8_t[detail::cab::getNeededBackingBufferSize(capacity, sizeof(DataT))]()) + { + this->_alignedBufferPtr = this->findCacheLineStart(_heapMem); + this->_alignedBufferCapacity = capacity; + } + + /** + * @brief Copy from other cache-aligned buffer. A new backing buffer will be allocated on the heap and + * its data will be copied from the other buffer. + */ + DynamicCacheAlignedBuffer(DynamicCacheAlignedBuffer const &other): + _heapMem(new uint8_t[detail::cab::getNeededBackingBufferSize(other._alignedBufferCapacity, sizeof(DataT))]) + { + this->_alignedBufferCapacity = other._alignedBufferCapacity; + this->_alignedBufferPtr = this->findCacheLineStart(_heapMem); + memcpy(this->_alignedBufferPtr, other._alignedBufferPtr, this->_alignedBufferCapacity * sizeof(DataT)); + } + + // Destructor + ~DynamicCacheAlignedBuffer() + { + delete[] this->_heapMem; + } + + /** + * @brief Assign from other cache-aligned buffer with the same type. A new buffer will be allocated + * of the correct size. 
+ */ + DynamicCacheAlignedBuffer &operator=(DynamicCacheAlignedBuffer const &other) + { + // Check for self assignment + if (&other == this) { + return *this; + } + + delete[] _heapMem; + _heapMem = new uint8_t[detail::cab::getNeededBackingBufferSize(other._alignedBufferCapacity, sizeof(DataT))]; + this->_alignedBufferPtr = this->findCacheLineStart(_heapMem); + + memcpy(this->_alignedBufferPtr, other._alignedBufferPtr, this->_alignedBufferCapacity * sizeof(DataT)); + } +}; + +/// @} + +} + +#endif //MBED_CACHEALIGNEDBUFFER_H diff --git a/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h b/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h index b1c7562596f..e5aafcf609f 100644 --- a/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h +++ b/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h @@ -57,6 +57,10 @@ * Access an SD Card using SPI bus */ class SDBlockDevice : public mbed::BlockDevice { + + // Only HC block size is supported. Making this a static constant reduces code size. 
+ static constexpr uint32_t _block_size = 512; /*!< Block size supported for SDHC card is 512 bytes */ + public: /** Creates an SDBlockDevice on a SPI bus specified by pins (using dynamic pin-map) * @@ -302,7 +306,6 @@ class SDBlockDevice : public mbed::BlockDevice { } rtos::Mutex _mutex; - static const uint32_t _block_size; uint32_t _erase_size; bool _is_initialized; bool _dbg; @@ -314,6 +317,7 @@ class SDBlockDevice : public mbed::BlockDevice { #if DEVICE_SPI_ASYNCH bool _async_spi_enabled = false; + mbed::StaticCacheAlignedBuffer _async_data_buffer; #endif }; diff --git a/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp b/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp index 381071b6803..73569fb6fbf 100644 --- a/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp +++ b/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp @@ -181,7 +181,6 @@ using namespace std::chrono; #define SD_BLOCK_DEVICE_ERROR_ERASE -5010 /*!< Erase error: reset/sequence */ #define SD_BLOCK_DEVICE_ERROR_WRITE -5011 /*!< SPI Write error: !SPI_DATA_ACCEPTED */ -#define BLOCK_SIZE_HC 512 /*!< Block size supported for SD card is 512 bytes */ #define WRITE_BL_PARTIAL 0 /*!< Partial block write - Not supported */ #define SPI_CMD(x) (0x40 | (x & 0x3f)) @@ -248,9 +247,6 @@ using namespace std::chrono; #define SPI_READ_ERROR_ECC_C (0x1 << 2) /*!< Card ECC failed */ #define SPI_READ_ERROR_OFR (0x1 << 3) /*!< Out of Range */ -// Only HC block size is supported. Making this a static constant reduces code size. 
-const uint32_t SDBlockDevice::_block_size = BLOCK_SIZE_HC; - #if MBED_CONF_SD_CRC_ENABLED SDBlockDevice::SDBlockDevice(PinName mosi, PinName miso, PinName sclk, PinName cs, uint64_t hz, bool crc_on) : _sectors(0), _spi(mosi, miso, sclk, cs, use_gpio_ssel), _is_initialized(0), @@ -269,7 +265,7 @@ SDBlockDevice::SDBlockDevice(PinName mosi, PinName miso, PinName sclk, PinName c _init_sck = MBED_CONF_SD_INIT_FREQUENCY; _transfer_sck = hz; - _erase_size = BLOCK_SIZE_HC; + _erase_size = _block_size; } #if MBED_CONF_SD_CRC_ENABLED @@ -290,7 +286,7 @@ SDBlockDevice::SDBlockDevice(const spi_pinmap_t &spi_pinmap, PinName cs, uint64_ _init_sck = MBED_CONF_SD_INIT_FREQUENCY; _transfer_sck = hz; - _erase_size = BLOCK_SIZE_HC; + _erase_size = _block_size; } SDBlockDevice::~SDBlockDevice() @@ -925,7 +921,15 @@ int SDBlockDevice::_read_bytes(uint8_t *buffer, uint32_t length) // read data #if DEVICE_SPI_ASYNCH if (_async_spi_enabled) { - _spi.transfer_and_wait(nullptr, 0, buffer, length); + if (length > _async_data_buffer.capacity()) { + return SD_BLOCK_DEVICE_ERROR_PARAMETER; + } + + // Do read into cache aligned buffer, then copy data into application buffer + if (_spi.transfer_and_wait(nullptr, 0, _async_data_buffer, length) != 0) { + return SD_BLOCK_DEVICE_ERROR_WRITE; + } + memcpy(buffer, _async_data_buffer.data(), length); } else #endif { @@ -968,9 +972,15 @@ int SDBlockDevice::_read(uint8_t *buffer, uint32_t length) // read data #if DEVICE_SPI_ASYNCH if (_async_spi_enabled) { - if (_spi.transfer_and_wait(nullptr, 0, buffer, length) != 0) { + if (length > _async_data_buffer.capacity()) { + return SD_BLOCK_DEVICE_ERROR_PARAMETER; + } + + // Do read into cache aligned buffer, then copy data into application buffer + if (_spi.transfer_and_wait(nullptr, 0, _async_data_buffer, length) != 0) { return SD_BLOCK_DEVICE_ERROR_WRITE; } + memcpy(buffer, _async_data_buffer.data(), length); } else #endif { @@ -1067,7 +1077,13 @@ bd_size_t SDBlockDevice::_sd_sectors() debug_if(SD_DBG, 
"Didn't get a response from the disk\n"); return 0; } +#ifdef DEVICE_SPI_ASYNCH + StaticCacheAlignedBuffer csd_buffer; + uint8_t *csd = csd_buffer.data(); +#else uint8_t csd[16]; +#endif + if (_read_bytes(csd, 16) != 0) { debug_if(SD_DBG, "Couldn't read csd response from disk\n"); return 0; @@ -1091,10 +1107,10 @@ bd_size_t SDBlockDevice::_sd_sectors() // ERASE_BLK_EN = 1: Erase in multiple of 512 bytes supported if (ext_bits(csd, 46, 46)) { - _erase_size = BLOCK_SIZE_HC; + _erase_size = _block_size; } else { // ERASE_BLK_EN = 1: Erase in multiple of SECTOR_SIZE supported - _erase_size = BLOCK_SIZE_HC * (ext_bits(csd, 45, 39) + 1); + _erase_size = _block_size * (ext_bits(csd, 45, 39) + 1); } break; @@ -1105,7 +1121,7 @@ bd_size_t SDBlockDevice::_sd_sectors() debug_if(SD_DBG, "Sectors: 0x%" PRIx64 "x : %" PRIu64 "\n", blocks, blocks); debug_if(SD_DBG, "Capacity: %" PRIu64 " MB\n", (blocks / (2048U))); // ERASE_BLK_EN is fixed to 1, which means host can erase one or multiple of 512 bytes. 
- _erase_size = BLOCK_SIZE_HC; + _erase_size = _block_size; break; default: diff --git a/targets/TARGET_Cypress/TARGET_PSOC6/cy_spi_api.c b/targets/TARGET_Cypress/TARGET_PSOC6/cy_spi_api.c index 6780ecff399..de40eed7301 100644 --- a/targets/TARGET_Cypress/TARGET_PSOC6/cy_spi_api.c +++ b/targets/TARGET_Cypress/TARGET_PSOC6/cy_spi_api.c @@ -234,7 +234,7 @@ const PinMap *spi_slave_cs_pinmap(void) #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, MBED_UNUSED uint8_t bit_width, uint32_t handler, uint32_t event, MBED_UNUSED DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, MBED_UNUSED uint8_t bit_width, uint32_t handler, uint32_t event, MBED_UNUSED DMAUsage hint) { struct spi_s *spi = cy_get_spi(obj); spi->async_handler = (void (*)(void))handler; @@ -248,6 +248,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, MBED_ERROR(MBED_MAKE_ERROR(MBED_MODULE_DRIVER_SPI, MBED_ERROR_CODE_FAILED_OPERATION), "cyhal_spi_transfer_async"); } spi->async_in_progress = true; + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_Freescale/TARGET_MCUXpresso_MCUS/TARGET_MCU_K64F/spi_api.c b/targets/TARGET_Freescale/TARGET_MCUXpresso_MCUS/TARGET_MCU_K64F/spi_api.c index a35f0a9cdc0..1ac6a1856bb 100644 --- a/targets/TARGET_Freescale/TARGET_MCUXpresso_MCUS/TARGET_MCU_K64F/spi_api.c +++ b/targets/TARGET_Freescale/TARGET_MCUXpresso_MCUS/TARGET_MCU_K64F/spi_api.c @@ -358,7 +358,7 @@ static void spi_buffer_set(spi_t *obj, const void *tx, uint32_t tx_length, void obj->rx_buff.width = bit_width; } -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t 
rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { if (spi_active(obj)) { - return; + return false; @@ -400,6 +400,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, // Can't enter deep sleep as long as SPI transfer is active sleep_manager_lock_deep_sleep(); } + + return hint != DMA_USAGE_NEVER; // DMA will be used as long as the hint is not NEVER } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c b/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c index 3cd900edca1..56b2600ed9b 100644 --- a/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c +++ b/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2017 Nordic Semiconductor ASA * All rights reserved. + * SPDX-License-Identifier: LicenseRef-Nordic-5-Clause * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -882,8 +883,10 @@ static void nordic_nrf5_spi_event_handler(nrfx_spi_evt_t const *p_event, void *p * Parameter event The logical OR of events to be registered * Parameter handler SPI interrupt handler * Parameter hint A suggestion for how to use DMA with this transfer + * + * Returns bool True if DMA was used for the transfer, false otherwise */ -void spi_master_transfer(spi_t *obj, +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, @@ -949,6 +952,12 @@ void spi_master_transfer(spi_t *obj, /* Signal callback handler.
*/ callback(); } + +#if NRFX_CHECK(NRFX_SPIM_ENABLED) + return true; // We always use DMA when the SPIM peripheral is available +#else + return false; // We always use interrupts when the SPI peripheral is available +#endif } /** The asynchronous IRQ handler diff --git a/targets/TARGET_NUVOTON/TARGET_M2354/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M2354/spi_api.c index 3e6342935aa..3a8cd3f1276 100644 --- a/targets/TARGET_NUVOTON/TARGET_M2354/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M2354/spi_api.c @@ -445,7 +445,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -573,7 +573,7 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M251/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M251/spi_api.c index d8e9d378aca..25eeb2d1c32 100644 --- a/targets/TARGET_NUVOTON/TARGET_M251/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M251/spi_api.c @@ -369,7 +369,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi);
SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -497,6 +497,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M261/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M261/spi_api.c index ebf7da50f25..313781728b7 100644 --- a/targets/TARGET_NUVOTON/TARGET_M261/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M261/spi_api.c @@ -390,7 +390,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -518,6 +518,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c index 8b2de0ed2e4..a43b8756960 100644 --- a/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c @@ -1,5 +1,6 @@ /* mbed Microcontroller Library * Copyright (c) 2015-2016 Nuvoton + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -383,7 +384,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -501,6 +502,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M460/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M460/spi_api.c index c9f7d4802e8..126bde64428 100644 --- a/targets/TARGET_NUVOTON/TARGET_M460/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M460/spi_api.c @@ -488,7 +488,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -616,6 +616,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M480/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M480/spi_api.c index ca5cf79e8b4..e3d2b552e11 100644 --- a/targets/TARGET_NUVOTON/TARGET_M480/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M480/spi_api.c 
@@ -439,7 +439,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -567,6 +567,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c b/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c index 3862edf5162..fe8c4e79e61 100644 --- a/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c @@ -1,5 +1,6 @@ /* mbed Microcontroller Library * Copyright (c) 2015-2017 Nuvoton + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -434,7 +435,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -549,6 +550,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, PDMA_Trigger(obj->spi.dma_chn_id_rx); PDMA_Trigger(obj->spi.dma_chn_id_tx); } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c b/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c index ddcf800de57..6ce546a7967 100644 --- a/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c @@ -1,5 +1,6 @@ /* mbed Microcontroller Library * Copyright (c) 2015-2016 Nuvoton + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -389,7 +390,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -503,6 +504,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_RENESAS/TARGET_RZ_A1XX/spi_api.c b/targets/TARGET_RENESAS/TARGET_RZ_A1XX/spi_api.c index 294de6efba6..0b6a09c17b4 100644 --- a/targets/TARGET_RENESAS/TARGET_RZ_A1XX/spi_api.c +++ b/targets/TARGET_RENESAS/TARGET_RZ_A1XX/spi_api.c @@ -518,7 +518,7 @@ static void spi_async_read(spi_t *obj) * ASYNCHRONOUS HAL ******************************************************************************/ -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { int i; MBED_ASSERT(obj); @@ -556,6 +556,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, spi_irqs_set(obj, 1); spi_async_write(obj); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_RENESAS/TARGET_RZ_A2XX/spi_api.c b/targets/TARGET_RENESAS/TARGET_RZ_A2XX/spi_api.c index 295ae9c4876..4e402514e45 100644 --- a/targets/TARGET_RENESAS/TARGET_RZ_A2XX/spi_api.c +++ 
b/targets/TARGET_RENESAS/TARGET_RZ_A2XX/spi_api.c @@ -526,7 +526,7 @@ static void spi_async_read(spi_t *obj) * ASYNCHRONOUS HAL ******************************************************************************/ -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { int i; MBED_ASSERT(obj); @@ -564,6 +564,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, spi_irqs_set(obj, 1); spi_async_write(obj); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_STM/TARGET_STM32H7/STM32Cube_FW/STM32H7xx_HAL_Driver/stm32h7xx_hal_dma_ex.c b/targets/TARGET_STM/TARGET_STM32H7/STM32Cube_FW/STM32H7xx_HAL_Driver/stm32h7xx_hal_dma_ex.c index e55c3d54ceb..c5f2ef7d9d2 100644 --- a/targets/TARGET_STM/TARGET_STM32H7/STM32Cube_FW/STM32H7xx_HAL_Driver/stm32h7xx_hal_dma_ex.c +++ b/targets/TARGET_STM/TARGET_STM32H7/STM32Cube_FW/STM32H7xx_HAL_Driver/stm32h7xx_hal_dma_ex.c @@ -290,7 +290,18 @@ HAL_StatusTypeDef HAL_DMAEx_MultiBufferStart_IT(DMA_HandleTypeDef *hdma, uint32_ { /* Enable Common interrupts*/ MODIFY_REG(((DMA_Stream_TypeDef *)hdma->Instance)->CR, (DMA_IT_TC | DMA_IT_TE | DMA_IT_DME | DMA_IT_HT), (DMA_IT_TC | DMA_IT_TE | DMA_IT_DME)); - ((DMA_Stream_TypeDef *)hdma->Instance)->FCR |= DMA_IT_FE; + + /* Mbed CE mod: Only enable the FIFO Error interrupt if the FIFO is actually enabled. + * If it's not enabled, then this interrupt can trigger spuriously from memory bus + * stalls that the DMA engine encounters, and this creates random DMA failures. 
+ * Reference forum thread here: + * https://community.st.com/t5/stm32-mcus-products/spi-dma-fifo-error-issue-feifx/td-p/537074 + * also: https://community.st.com/t5/stm32-mcus-touch-gfx-and-gui/spi-dma-error-is-occurred-when-the-other-dma-memory-to-memory-is/td-p/191590 + */ + if(((DMA_Stream_TypeDef *)hdma->Instance)->FCR & DMA_SxFCR_DMDIS) + { + ((DMA_Stream_TypeDef *)hdma->Instance)->FCR |= DMA_IT_FE; + } if((hdma->XferHalfCpltCallback != NULL) || (hdma->XferM1HalfCpltCallback != NULL)) { diff --git a/targets/TARGET_STM/stm_spi_api.c b/targets/TARGET_STM/stm_spi_api.c index f444afeb1e6..d5c49fbcd08 100644 --- a/targets/TARGET_STM/stm_spi_api.c +++ b/targets/TARGET_STM/stm_spi_api.c @@ -2,6 +2,7 @@ ******************************************************************************* * Copyright (c) 2015, STMicroelectronics * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1410,13 +1411,15 @@ int spi_master_block_write(spi_t *obj, const char *tx_buffer, int tx_length, int total = (tx_length > rx_length) ? tx_length : rx_length; if (handle->Init.Direction == SPI_DIRECTION_2LINES) { + const int word_size = 0x01 << bitshift; + int write_fill_frame = write_fill; /* extend fill symbols for 16/32 bit modes */ - for (int i = 0; i < bitshift; i++) { + for (int i = 0; i < word_size; i++) { write_fill_frame = (write_fill_frame << 8) | write_fill; } - const int word_size = 0x01 << bitshift; + for (int i = 0; i < total; i += word_size) { int out = (i < tx_length) ? 
spi_get_word_from_buffer(tx_buffer + i, bitshift) : write_fill_frame; int in = spi_master_write(obj, out); @@ -1534,8 +1537,8 @@ typedef enum { } transfer_type_t; -/// @returns the number of bytes transferred, or `0` if nothing transferred -static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer_type, const void *tx, void *rx, size_t length, DMAUsage hint) +/// @returns True if DMA was used, false otherwise +static bool spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer_type, const void *tx, void *rx, size_t length, DMAUsage hint) { struct spi_s *spiobj = SPI_S(obj); SPI_HandleTypeDef *handle = &(spiobj->handle); @@ -1547,7 +1550,6 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer size_t words; obj->spi.transfer_type = transfer_type; - words = length >> bitshift; bool useDMA = false; @@ -1579,6 +1581,8 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer useDMA = true; } + + obj->spi.curr_transfer_uses_dma = useDMA; #endif DEBUG_PRINTF("SPI inst=0x%8X Start: type=%u, length=%u, DMA=%d\r\n", (int) handle->Instance, transfer_type, length, !!useDMA); @@ -1590,15 +1594,6 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer NVIC_SetPriority(irq_n, 1); NVIC_EnableIRQ(irq_n); -#if defined(STM32_SPI_CAPABILITY_DMA) && defined(__DCACHE_PRESENT) - if (useDMA && transfer_type != SPI_TRANSFER_TYPE_RX) - { - // For chips with a cache (e.g. Cortex-M7), we need to evict the Tx data from cache to main memory. - // This ensures that the DMA controller can see the most up-to-date copy of the data. 
- SCB_CleanDCache_by_Addr((volatile void*)tx, length); - } -#endif - // flush FIFO #if defined(SPI_FLAG_FRLVL) HAL_SPIEx_FlushRxFifo(handle); @@ -1635,7 +1630,7 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer if (useDMA) { #if defined(STM32_SPI_CAPABILITY_DMA) && defined(__DCACHE_PRESENT) - // For chips with a cache (e.g. Cortex-M7), we need to evict the Tx data from cache to main memory. + // For chips with a cache (e.g. Cortex-M7), we need to evict the Tx fill data from cache to main memory. // This ensures that the DMA controller can see the most up-to-date copy of the data. SCB_CleanDCache_by_Addr(rx, length); #endif @@ -1650,15 +1645,6 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer length = 0; } -#if defined(STM32_SPI_CAPABILITY_DMA) && defined(__DCACHE_PRESENT) - if (useDMA && transfer_type != SPI_TRANSFER_TYPE_TX) - { - // For chips with a cache (e.g. Cortex-M7), we need to invalidate the Rx data in cache. - // This ensures that the CPU will fetch the data from SRAM instead of using its cache. 
- SCB_InvalidateDCache_by_Addr(rx, length); - } -#endif - if (rc) { #if defined(SPI_IP_VERSION_V2) // enable SPI back in case of error @@ -1667,14 +1653,13 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer } #endif DEBUG_PRINTF("SPI: RC=%u\n", rc); - length = 0; } - return length; + return useDMA; } // asynchronous API -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *spiobj = SPI_S(obj); SPI_HandleTypeDef *handle = &(spiobj->handle); @@ -1687,7 +1672,7 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, // don't do anything, if the buffers aren't valid if (!use_tx && !use_rx) { - return; + return false; } // copy the buffers to the SPI object @@ -1717,11 +1702,14 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, obj->tx_buff.length = size; obj->rx_buff.length = size; } - spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_TXRX, tx, rx, size, hint); + return spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_TXRX, tx, rx, size, hint); } else if (use_tx) { - spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_TX, tx, NULL, tx_length, hint); + return spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_TX, tx, NULL, tx_length, hint); } else if (use_rx) { - spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_RX, NULL, rx, rx_length, hint); + return spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_RX, NULL, rx, rx_length, hint); + } + else { + return false; } } @@ -1830,21 +1818,10 @@ void spi_abort_asynch(spi_t *obj) NVIC_ClearPendingIRQ(irq_n); NVIC_DisableIRQ(irq_n); -#ifdef STM32_SPI_CAPABILITY_DMA - // Abort DMA transfers if DMA channels have 
been allocated - if(spiobj->txDMAInitialized) { - HAL_DMA_Abort_IT(spiobj->handle.hdmatx); - } - if(spiobj->rxDMAInitialized) { - HAL_DMA_Abort_IT(spiobj->handle.hdmarx); - } -#endif - - // clean-up - LL_SPI_Disable(SPI_INST(obj)); - HAL_SPI_DeInit(handle); - - init_spi(obj); + // Use HAL abort function. + // Conveniently, this is smart enough to automatically abort the DMA transfer + // if DMA was used. + HAL_SPI_Abort_IT(handle); } #endif //DEVICE_SPI_ASYNCH diff --git a/targets/TARGET_STM/stm_spi_api.h b/targets/TARGET_STM/stm_spi_api.h index 6fb22c93c4e..f34f7d17ef7 100644 --- a/targets/TARGET_STM/stm_spi_api.h +++ b/targets/TARGET_STM/stm_spi_api.h @@ -36,6 +36,7 @@ struct spi_s { #if DEVICE_SPI_ASYNCH uint32_t event; uint8_t transfer_type; + bool curr_transfer_uses_dma; // Callback function for when we get an interrupt on an async transfer. // This will point, through a bit of indirection, to SPI::irq_handler_asynch() diff --git a/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c b/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c index c7a2b76d09d..0a852b5f9cb 100644 --- a/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c +++ b/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c @@ -1116,8 +1116,10 @@ static void spi_activate_dma(spi_t *obj, void* rxdata, const void* txdata, int t * * ALWAYS: use DMA if channels are available, and hold on to the channels after the transfer. * If the previous transfer has kept the channel, that channel will continue to get used. * +* Returns true if DMA was used, false otherwise. 
+* ********************************************************************/ -void spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int tx_length, int rx_length, void* cb, DMAUsage hint) +bool spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int tx_length, int rx_length, void* cb, DMAUsage hint) { /* Init DMA here to include it in the power figure */ dma_init(); @@ -1128,11 +1130,13 @@ void spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int t if (hint != DMA_USAGE_NEVER && obj->spi.dmaOptionsTX.dmaUsageState == DMA_USAGE_ALLOCATED) { /* setup has already been done, so just activate the transfer */ spi_activate_dma(obj, rxdata, txdata, tx_length, rx_length); + return true; } else if (hint == DMA_USAGE_NEVER) { /* use IRQ */ obj->spi.spi->IFC = 0xFFFFFFFF; spi_master_write_asynch(obj); spi_enable_interrupt(obj, (uint32_t)cb, true); + return false; } else { /* try to acquire channels */ dma_init(); @@ -1147,11 +1151,13 @@ void spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int t spi_master_dma_channel_setup(obj, cb); /* and activate the transfer */ spi_activate_dma(obj, rxdata, txdata, tx_length, rx_length); + return true; } else { /* DMA is unavailable, so fall back to IRQ */ obj->spi.spi->IFC = 0xFFFFFFFF; spi_master_write_asynch(obj); spi_enable_interrupt(obj, (uint32_t)cb, true); + return false; } } } @@ -1167,8 +1173,11 @@ void spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int t * @param[in] event The logical OR of events to be registered * @param[in] handler SPI interrupt handler * @param[in] hint A suggestion for how to use DMA with this transfer + * + * @return True if DMA was actually used for the transfer, false otherwise (if interrupts or another CPU-based + * method is used to do the transfer). 
*/ -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { - if( spi_active(obj) ) return; + if( spi_active(obj) ) return false; @@ -1190,7 +1199,7 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, spi_enable_event(obj, event, true); /* And kick off the transfer */ - spi_master_transfer_dma(obj, tx, rx, tx_length, rx_length, (void*)handler, hint); + return spi_master_transfer_dma(obj, tx, rx, tx_length, rx_length, (void*)handler, hint); } diff --git a/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c b/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c index bd98132b0b1..9d8b846e582 100644 --- a/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c +++ b/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c @@ -2,6 +2,7 @@ ******************************************************************************* * (C)Copyright TOSHIBA ELECTRONIC DEVICES & STORAGE CORPORATION 2017 All rights reserved * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -409,7 +410,7 @@ const PinMap *spi_slave_cs_pinmap() #ifdef DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *obj_s = SPI_S(obj); @@ -455,6 +456,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, SSP_Enable(spi); NVIC_EnableIRQ(obj_s->irqn); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_TOSHIBA/TARGET_TMPM4G9/spi_api.c b/targets/TARGET_TOSHIBA/TARGET_TMPM4G9/spi_api.c index f8ebe48fa29..3e3cfe722e8 100644 --- a/targets/TARGET_TOSHIBA/TARGET_TMPM4G9/spi_api.c +++ b/targets/TARGET_TOSHIBA/TARGET_TMPM4G9/spi_api.c @@ -484,7 +484,7 @@ const PinMap *spi_slave_cs_pinmap() #ifdef DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *obj_s = SPI_S(obj); tspi_t *p_obj = &(obj_s->p_obj); @@ -535,6 +535,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, NVIC_EnableIRQ(obj_s->rxirqn); } NVIC_EnableIRQ(obj_s->errirqn); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_TOSHIBA/TARGET_TMPM4GR/spi_api.c b/targets/TARGET_TOSHIBA/TARGET_TMPM4GR/spi_api.c index 
7ebaef77823..1a997914003 100644 --- a/targets/TARGET_TOSHIBA/TARGET_TMPM4GR/spi_api.c +++ b/targets/TARGET_TOSHIBA/TARGET_TMPM4GR/spi_api.c @@ -517,7 +517,7 @@ const PinMap *spi_slave_cs_pinmap() #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *spiobj = SPI_S(obj); @@ -574,6 +574,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, NVIC_EnableIRQ(p_irqn->Error); NVIC_EnableIRQ(p_irqn->Tx); NVIC_EnableIRQ(p_irqn->Rx); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) { diff --git a/targets/TARGET_TOSHIBA/TARGET_TMPM4NR/spi_api.c b/targets/TARGET_TOSHIBA/TARGET_TMPM4NR/spi_api.c index dc958ba927e..f288f5b3987 100644 --- a/targets/TARGET_TOSHIBA/TARGET_TMPM4NR/spi_api.c +++ b/targets/TARGET_TOSHIBA/TARGET_TMPM4NR/spi_api.c @@ -398,7 +398,7 @@ const PinMap *spi_slave_cs_pinmap() #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *spiobj = SPI_S(obj); @@ -455,6 +455,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, NVIC_EnableIRQ(p_irqn->Error); NVIC_EnableIRQ(p_irqn->Tx); NVIC_EnableIRQ(p_irqn->Rx); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) { diff --git a/tools/python/scancode_evaluate/scancode_evaluate.py b/tools/python/scancode_evaluate/scancode_evaluate.py index 3cf24457f9d..3e1ebeb91df 100644 --- 
a/tools/python/scancode_evaluate/scancode_evaluate.py +++ b/tools/python/scancode_evaluate/scancode_evaluate.py @@ -72,14 +72,30 @@ def has_spdx_text_in_scancode_output(scancode_output_data_file_licenses): ) +SPDX_LICENSE_REGEX = re.compile(r"SPDX-License-Identifier: *(\S+)") + + def has_spdx_text_in_analysed_file(scanned_file_content): """Returns true if the file analysed by ScanCode contains SPDX identifier.""" - return bool(re.findall("SPDX-License-Identifier:?", scanned_file_content)) + return bool(SPDX_LICENSE_REGEX.findall(scanned_file_content)) - -def has_binary_license(scanned_file_content): - """Returns true if the file analysed by ScanCode contains a Permissive Binary License.""" - return bool(re.findall("Permissive Binary License", scanned_file_content)) +def has_exempted_spdx_identifier(scanned_file_content): + """ + Returns true if the file analysed by scancode contains an exempted SPDX identifier. + (this is used for licenses like Nordic BSD which are not technically permissive licenses according + to SPDX but which are still OK for us) + """ + spdx_find_result = SPDX_LICENSE_REGEX.findall(scanned_file_content) + if len(spdx_find_result) == 1: + if spdx_find_result[0] == "LicenseRef-Nordic-5-Clause": + return True + if spdx_find_result[0] == "LicenseRef-PBL": + return True + else: + return False + else: + # Either 0 or multiple matches, ignore + return False def get_file_text(scancode_output_data_file): @@ -94,6 +110,7 @@ def get_file_text(scancode_output_data_file): except UnicodeDecodeError: userlog.warning("Unable to decode file text in: %s" % file_path) # Ignore files that cannot be decoded + return None def license_check(scancode_output_path): @@ -135,7 +152,10 @@ def license_check(scancode_output_path): if not has_permissive_text_in_scancode_output(scancode_output_data_file['licenses']): scanned_file_content = get_file_text(scancode_output_data_file) - if not (scanned_file_content and has_binary_license(scanned_file_content)): + if 
(scanned_file_content is None + or has_exempted_spdx_identifier(scanned_file_content)): + continue + else: scancode_output_data_file['fail_reason'] = MISSING_PERMISSIVE_LICENSE_TEXT license_offenders.append(scancode_output_data_file)