Skip to content

Commit

Permalink
Handle the unknown d_type field in posix readdir
Browse files Browse the repository at this point in the history
  • Loading branch information
adithyaov committed Dec 10, 2024
1 parent 2761290 commit a0dbf32
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 25 deletions.
15 changes: 10 additions & 5 deletions core/src/Streamly/Internal/FileSystem/DirIO.hs
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ import Streamly.Internal.FileSystem.Windows.ReadDir
#else
import Streamly.Internal.FileSystem.Posix.ReadDir
( DirStream, openDirStream, closeDirStream, readDirStreamEither
, readEitherChunks)
, readEitherChunks, PathClassified, evaluateUnknown, unClassifyPath)
#endif
import qualified Streamly.Internal.Data.Stream as S
import qualified Streamly.Data.Unfold as UF
Expand Down Expand Up @@ -238,19 +238,19 @@ toStreamWithBufferOf chunkSize h = AS.concat $ toChunksWithBufferOf chunkSize h

{-# INLINE streamEitherReader #-}
streamEitherReader :: MonadIO m =>
Unfold m DirStream (Either Path Path)
Unfold m DirStream PathClassified
streamEitherReader = Unfold step return
where

step strm = do
r <- liftIO $ readDirStreamEither strm
case r of
Nothing -> return Stop
Just x -> return $ Yield x strm
Just (x) -> return $ Yield x strm

{-# INLINE streamReader #-}
streamReader :: MonadIO m => Unfold m DirStream Path
streamReader = fmap (either id id) streamEitherReader
streamReader = fmap unClassifyPath streamEitherReader

-- | Read a directory emitting a stream with names of the children. Filter out
-- "." and ".." entries.
Expand Down Expand Up @@ -283,7 +283,12 @@ eitherReader =
-- XXX The measured overhead of bracketIO is not noticeable, if it turns
-- out to be a problem for small filenames we can use getdents64 to use
-- chunked read to avoid the overhead.
UF.bracketIO openDirStream closeDirStream streamEitherReader
UF.bracketIO
(\parent -> (parent,) <$> openDirStream parent)
(\(_, dirStream) -> closeDirStream dirStream)
(UF.mapM2
(\(parent, _) p -> liftIO (evaluateUnknown parent p))
(UF.lmap snd streamEitherReader))

{-# INLINE eitherReaderPaths #-}
eitherReaderPaths ::(MonadIO m, MonadCatch m) =>
Expand Down
16 changes: 16 additions & 0 deletions core/src/Streamly/Internal/FileSystem/Posix/ReadDir.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#include <sys/stat.h>

int stat_is_directory(const char *path) {
struct stat statbuf;

// Call stat to get the file status
if (stat(path, &statbuf) == 0) {
// Check if the file is a directory using S_ISDIR macro
if (S_ISDIR(statbuf.st_mode)) {
return 1; // It is a directory
} else {
return 0; // Not a directory
}
}
return -1; // An error occurred (stat failed)
}
83 changes: 66 additions & 17 deletions core/src/Streamly/Internal/FileSystem/Posix/ReadDir.hsc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ module Streamly.Internal.FileSystem.Posix.ReadDir
(
#if !defined(mingw32_HOST_OS) && !defined(__MINGW32__)
DirStream
, PathClassified(..)
, unClassifyPath
, evaluateUnknown
, openDirStream
, closeDirStream
, readDirStreamEither
Expand Down Expand Up @@ -80,6 +83,15 @@ data {-# CTYPE "struct dirent" #-} CDirent

newtype DirStream = DirStream (Ptr CDir)

-------------------------------------------------------------------------------
-- Stat
-------------------------------------------------------------------------------

foreign import ccall unsafe "stat_is_directory"
c_stat_is_directory :: CString -> IO CInt

-------------------------------------------------------------------------------
-- Functions
-------------------------------------------------------------------------------

foreign import ccall unsafe "closedir"
Expand Down Expand Up @@ -133,6 +145,42 @@ isMetaDir dname = do
then return True
else return False

statCheckIfDir :: PosixPath -> IO Bool
statCheckIfDir path =
Array.asCStringUnsafe (Path.toChunk path) $ \cStr -> do
res <- c_stat_is_directory cStr
case res of
x | x == 0 -> pure True
x | x == 1 -> pure False
_ -> throwErrno "checkIfDirectory"

{-# INLINE appendCString #-}
appendCString :: PosixPath -> CString -> IO PosixPath
appendCString a b = do
b1 <- Array.fromCString (castPtr b)
pure $ Path.append a (Path.unsafeFromChunk b1)

data PathClassified
= PCDir PosixPath
| PCFile PosixPath
| PCUnknown PosixPath

unClassifyPath :: PathClassified -> PosixPath
unClassifyPath (PCDir a) = a
unClassifyPath (PCFile a) = a
unClassifyPath (PCUnknown a) = a

evaluateUnknown
:: PosixPath -> PathClassified -> IO (Either PosixPath PosixPath)
evaluateUnknown _ (PCDir a) = pure $ Left a
evaluateUnknown _ (PCFile a) = pure $ Right a
evaluateUnknown parent (PCUnknown child) = do
statIsDir <- statCheckIfDir $ Path.append parent child
pure
$ if statIsDir
then Left child
else Right child

-- XXX We can use getdents64 directly so that we can use array slices from the
-- same buffer that we passed to the OS. That way we can also avoid any
-- overhead of bracket.
Expand All @@ -141,7 +189,7 @@ isMetaDir dname = do
-- {-# INLINE readDirStreamEither #-}
readDirStreamEither ::
-- DirStream -> IO (Either (Rel (Dir Path)) (Rel (File Path)))
DirStream -> IO (Maybe (Either PosixPath PosixPath))
DirStream -> IO (Maybe PathClassified)
readDirStreamEither (DirStream dirp) = loop

where
Expand All @@ -168,8 +216,10 @@ readDirStreamEither (DirStream dirp) = loop
isMeta <- isMetaDir dname
if isMeta
then loop
else return (Just (Left (mkPath name)))
else return (Just (Right (mkPath name)))
else return (Just (PCDir (mkPath name)))
else if (dtype == #const DT_UNKNOWN)
then pure (Just (PCUnknown (mkPath name)))
else return (Just (PCFile (mkPath name)))
else do
errno <- getErrno
if (errno == eINTR)
Expand Down Expand Up @@ -208,9 +258,6 @@ readEitherChunks alldirs =
dirMax = 4
fileMax = 1000

mkPath :: Array Word8 -> PosixPath
mkPath = Path.unsafeFromChunk

step _ (ChunkStreamInit (x:xs) dirs ndirs files nfiles) = do
DirStream dirp <- liftIO $ openDirStream x
return $ Skip (ChunkStreamLoop x xs dirp dirs ndirs files nfiles)
Expand All @@ -233,10 +280,12 @@ readEitherChunks alldirs =
dtype :: #{type unsigned char} <-
liftIO $ #{peek struct dirent, d_type} dentPtr

name <- Array.fromCString (castPtr dname)
let path = Path.append curdir (mkPath name)

if (dtype == (#const DT_DIR))
path <- liftIO $ appendCString curdir dname
statIsDir <-
if dtype == #const DT_UNKNOWN
then liftIO $ statCheckIfDir path
else pure False
if dtype == (#const DT_DIR) || statIsDir
then do
isMeta <- liftIO $ isMetaDir dname
if isMeta
Expand Down Expand Up @@ -330,9 +379,6 @@ readEitherByteChunks alldirs =
-- from the output channel, then consume that stream by using a monad bind.
bufSize = 4000

mkPath :: Array Word8 -> PosixPath
mkPath = Path.unsafeFromChunk

copyToBuf dstArr pos dirPath name = do
nameLen <- fmap fromIntegral (liftIO $ c_strlen name)
let PosixPath (Array dirArr start end) = dirPath
Expand Down Expand Up @@ -399,7 +445,11 @@ readEitherByteChunks alldirs =
-- XXX Skips come around the entire loop, does that impact perf
-- because it has a StreamK in the middle.
-- Keep the file check first as it is more likely
if (dtype /= (#const DT_DIR))
statIsDir <-
if dtype == #const DT_UNKNOWN
then liftIO (appendCString curdir dname >>= statCheckIfDir)
else pure False
if dtype /= (#const DT_DIR) && not statIsDir
then do
r <- copyToBuf mbarr pos curdir dname
case r of
Expand All @@ -419,9 +469,8 @@ readEitherByteChunks alldirs =
if isMeta
then return $ Skip st
else do
name <- Array.fromCString (castPtr dname)
let path = Path.append curdir (mkPath name)
dirs1 = path : dirs
path <- liftIO $ appendCString curdir dname
let dirs1 = path : dirs
ndirs1 = ndirs + 1
r <- copyToBuf mbarr pos curdir dname
case r of
Expand Down
22 changes: 19 additions & 3 deletions core/src/Streamly/Internal/FileSystem/Windows/ReadDir.hsc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ module Streamly.Internal.FileSystem.Windows.ReadDir
(
#if defined(mingw32_HOST_OS) || defined(__MINGW32__)
DirStream
, PathClassified(..)
, unClassifyPath
, evaluateUnknown
, openDirStream
, closeDirStream
, readDirStreamEither
Expand Down Expand Up @@ -57,6 +60,19 @@ type LPCTSTR = Ptr CWchar
type WIN32_FIND_DATA = ()
type HANDLE = Ptr ()

------------------------------------------------------------------------------
-- Commonization helpers
------------------------------------------------------------------------------

type PathClassified = Either WindowsPath WindowsPath

unClassifyPath :: PathClassified -> WindowsPath
unClassifyPath = either id id

evaluateUnknown
:: WindowsPath -> PathClassified -> IO (Either WindowsPath WindowsPath)
evaluateUnknown _ = pure

------------------------------------------------------------------------------
-- Windows C APIs
------------------------------------------------------------------------------
Expand Down Expand Up @@ -104,7 +120,7 @@ failWith fn_name err_code = do
c_msg <- getErrorMessage err_code
msg <- if c_msg == nullPtr
then return $ "Error 0x" ++ Numeric.showHex err_code ""
else do
else do
msg <- peekCWString c_msg
-- We ignore failure of freeing c_msg, given we're already failing
_ <- localFree c_msg
Expand Down Expand Up @@ -145,8 +161,8 @@ openDirStream p = do
Array.asCStringUnsafe (Path.toChunk path) $ \pathPtr -> do
-- XXX Use getLastError to distinguish the case when no
-- matching file is found. See the doc of FindFirstFileW.
failIf
(== iNVALID_HANDLE_VALUE)
failIf
(== iNVALID_HANDLE_VALUE)
("FindFirstFileW: " ++ Path.toString path)
$ c_FindFirstFileW (castPtr pathPtr) dataPtr
ref <- newIORef True
Expand Down
1 change: 1 addition & 0 deletions core/streamly-core.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ library
, src/Streamly/Internal/Data/Stream

c-sources: src/Streamly/Internal/Data/MutArray/Lib.c
, src/Streamly/Internal/FileSystem/Posix/ReadDir.c

-- Prefer OS conditionals inside the source files rather than here,
-- conditionals here do not work well with cabal2nix.
Expand Down

0 comments on commit a0dbf32

Please sign in to comment.