1 {-# LANGUAGE CPP, ScopedTypeVariables, MultiParamTypeClasses #-}
    2 
    3 -- | This module contains plain tree indexing code. The index itself is a
    4 -- CACHE: you should only ever use it as an optimisation and never as a primary
    5 -- storage. In practice, this means that when we change index format, the
    6 -- application is expected to throw the old index away and build a fresh
    7 -- index. Please note that tracking index validity is out of scope for this
    8 -- library: this is responsibility of your application. It is advisable that in
    9 -- your validity tracking code, you also check for format validity (see
   10 -- 'indexFormatValid') and scrap and re-create index when needed.
   11 --
   12 -- The index is a binary file that overlays a hashed tree over the working
   13 -- copy. This means that every working file and directory has an entry in the
   14 -- index, that contains its path and hash and validity data. The validity data
   15 -- is a timestamp plus the file size. The file hashes are sha256's of the
   16 -- file's content.
   17 --
   18 -- There are two entry types, a file entry and a directory entry. Both have a
   19 -- common binary format (see 'Item'). The on-disk format is best described by
   20 -- the section /Index format/ below.
   21 --
   22 -- For each file, the index has a copy of the file's last modification
   23 -- timestamp taken at the instant when the hash has been computed. This means
   24 -- that when file size and timestamp of a file in working copy matches those in
   25 -- the index, we assume that the hash stored in the index for given file is
   26 -- valid. These hashes are then exposed in the resulting 'Tree' object, and can
   27 -- be leveraged by eg.  'diffTrees' to compare many files quickly.
   28 --
   29 -- You may have noticed that we also keep hashes of directories. These are
   30 -- assumed to be valid whenever the complete subtree has been valid. At any
   31 -- point, as soon as a size or timestamp mismatch is found, the working file in
   32 -- question is opened, its hash (and timestamp and size) is recomputed and
   33 -- updated in-place in the index file (everything lives at a fixed offset and
   34 -- is fixed size, so this isn't an issue). This is also true of directories:
   35 -- when a file in a directory changes hash, this triggers recomputation of all
   36 -- of its parent directory hashes; moreover this is done efficiently -- each
   37 -- directory is updated at most once during an update run.
   38 --
   39 -- /Index format/
   40 --
   41 -- The Index is organised into \"lines\" where each line describes a single
   42 -- indexed item. Cf. 'Item'.
   43 --
   44 -- The first word on the index \"line\" is the length of the file path (which is
   45 -- the only variable-length part of the line). Then comes the path itself, then
   46 -- fixed-length hash (sha256) of the file in question, then two words, one for
   47 -- size and one "aux", which is used differently for directories and for files.
   48 --
   49 -- With directories, this aux holds the offset of the next sibling line in the
   50 -- index, so we can efficiently skip reading the whole subtree starting at a
   51 -- given directory (by just seeking aux bytes forward). The lines are
   52 -- pre-ordered with respect to directory structure -- the directory comes first
   53 -- and after it come all its items. Cf. 'readIndex''.
   54 --
   55 -- For files, the aux field holds a timestamp.
   56 
   57 module Storage.Hashed.Index( readIndex, updateIndexFrom, indexFormatValid
   58                            , updateIndex , Index, filter )
   59     where
   60 
   61 import Prelude hiding ( lookup, readFile, writeFile, catch, filter )
   62 import Storage.Hashed.Utils
   63 import Storage.Hashed.Tree
   64 import Storage.Hashed.Path
   65 import Data.Int( Int64, Int32 )
   66 
   67 import Bundled.Posix( getFileStatusBS, modificationTime,
   68                       getFileStatus, fileSize, fileExists )
   69 import System.IO.MMap( mmapFileForeignPtr, mmapFileByteString, Mode(..) )
   70 import System.IO( )
   71 import System.Directory( doesFileExist, getCurrentDirectory )
   72 #if mingw32_HOST_OS
   73 import System.Directory( renameFile )
   74 import System.FilePath( (<.>) )
   75 #else
   76 import System.Directory( removeFile )
   77 #endif
   78 import qualified System.FilePath as FP( (</>) )
   79 
   80 import Control.Monad( when )
   81 import Control.Exception.Extensible
   82 import Control.Applicative( (<$>) )
   83 
   84 import qualified Data.ByteString.Lazy as BL
   85 import qualified Data.ByteString as BS
   86 import qualified Data.ByteString.Char8 as BSC
   87 import Data.ByteString.Unsafe( unsafeHead, unsafeDrop )
   88 import Data.ByteString.Internal( toForeignPtr, fromForeignPtr, memcpy
   89                                , nullForeignPtr, c2w )
   90 
   91 import Data.IORef( )
   92 import Data.Maybe( fromJust, isJust )
   93 import Data.Bits( Bits )
   94 
   95 import Foreign.Storable
   96 import Foreign.ForeignPtr
   97 import Foreign.Ptr
   98 
   99 import Storage.Hashed.Hash( sha256, rawHash )
  100 
  101 --------------------------
  102 -- Indexed trees
  103 --
  104 
  105 -- | Description of a a single indexed item. The structure itself does not
  106 -- contain any data, just pointers to the underlying mmap (bytestring is a
  107 -- pointer + offset + length).
  108 --
  109 -- The structure is recursive-ish (as opposed to flat-ish structure, which is
  110 -- used by git...) It turns out that it's hard to efficiently read a flat index
  111 -- with our internal data structures -- we need to turn the flat index into a
  112 -- recursive Tree object, which is rather expensive... As a bonus, we can also
  113 -- efficiently implement subtree queries this way (cf. 'readIndex').
  114 data Item = Item { iBase :: !(Ptr ())
  115                  , iHashAndDescriptor :: !BS.ByteString
  116                  } deriving Show
  117 
  118 size_magic :: Int
  119 size_magic = 4 -- the magic word, first 4 bytes of the index
  120 
  121 size_dsclen, size_hash, size_size, size_aux :: Int
  122 size_size = 8 -- file/directory size (Int64)
  123 size_aux = 8 -- aux (Int64)
  124 size_dsclen = 4 -- this many bytes store the length of the path
  125 size_hash = 32 -- hash representation
  126 
  127 off_size, off_aux, off_hash, off_dsc, off_dsclen :: Int
  128 off_size = 0
  129 off_aux = off_size + size_size
  130 off_dsclen = off_aux + size_aux
  131 off_hash = off_dsclen + size_dsclen
  132 off_dsc = off_hash + size_hash
  133 
  134 itemAllocSize :: Relative -> Int
  135 itemAllocSize apath =
  136     align 4 $ size_hash + size_size + size_aux + size_dsclen + 2 + BS.length (pathToBS apath)
  137 
  138 itemSize, itemNext :: Item -> Int
  139 itemSize i = size_size + size_aux + size_dsclen + (BS.length $ iHashAndDescriptor i)
  140 itemNext i = align 4 (itemSize i + 1)
  141 
  142 iPath, iHash, iDescriptor :: Item -> BS.ByteString
  143 iDescriptor = unsafeDrop size_hash . iHashAndDescriptor
  144 iPath = unsafeDrop 1 . iDescriptor
  145 iHash = BS.take size_hash . iHashAndDescriptor
  146 
  147 iSize, iAux :: Item -> Ptr Int64
  148 iSize i = plusPtr (iBase i) off_size
  149 iAux i = plusPtr (iBase i) off_aux
  150 
  151 itemIsDir :: Item -> Bool
  152 itemIsDir i = unsafeHead (iDescriptor i) == c2w 'D'
  153 
  154 -- xlatePeek32 = fmap xlate32 . peek
  155 xlatePeek64 :: (Storable a, Bits a) => Ptr a -> IO a
  156 xlatePeek64 = fmap xlate64 . peek
  157 
  158 -- xlatePoke32 ptr v = poke ptr (xlate32 v)
  159 xlatePoke64 :: (Storable a, Bits a) => Ptr a -> a -> IO ()
  160 xlatePoke64 ptr v = poke ptr (xlate64 v)
  161 
  162 -- | Lay out the basic index item structure in memory. The memory location is
  163 -- given by a ForeignPointer () and an offset. The path and type given are
  164 -- written out, and a corresponding Item is given back. The remaining bits of
  165 -- the item can be filled out using 'update'.
  166 createItem :: ItemType -> Relative -> ForeignPtr () -> Int -> IO Item
  167 createItem typ apath fp off =
  168  do let dsc = BS.concat [ BSC.singleton $ if typ == TreeType then 'D' else 'F'
  169                         , pathToBS apath
  170                         , BS.singleton 0 ]
  171         (dsc_fp, dsc_start, dsc_len) = toForeignPtr dsc
  172     withForeignPtr fp $ \p ->
  173         withForeignPtr dsc_fp $ \dsc_p ->
  174             do pokeByteOff p (off + off_dsclen) (xlate32 $ fromIntegral dsc_len :: Int32)
  175                memcpy (plusPtr p $ off + off_dsc)
  176                       (plusPtr dsc_p dsc_start)
  177                       (fromIntegral dsc_len)
  178                peekItem fp off
  179 
  180 -- | Read the on-disk representation into internal data structure.
  181 --
  182 -- See the module-level section /Index format/ for details on how the index
  183 -- is structured.
  184 peekItem :: ForeignPtr () -> Int -> IO Item
  185 peekItem fp off =
  186     withForeignPtr fp $ \p -> do
  187       nl' :: Int32 <- xlate32 `fmap` peekByteOff p (off + off_dsclen)
  188       when (nl' <= 2) $ fail "Descriptor too short in peekItem!"
  189       let nl = fromIntegral nl'
  190           dsc = fromForeignPtr (castForeignPtr fp) (off + off_hash) (size_hash + nl - 1)
  191       return $! Item { iBase = plusPtr p off
  192                      , iHashAndDescriptor = dsc }
  193 
  194 -- | Update an existing item with new hash and optionally mtime (give Nothing
  195 -- when updating directory entries).
  196 updateItem :: Item -> Int64 -> Hash -> IO ()
  197 updateItem item _ NoHash =
  198     fail $ "Index.update NoHash: " ++ BSC.unpack (iPath item)
  199 updateItem item size hash =
  200     do xlatePoke64 (iSize item) size
  201        unsafePokeBS (iHash item) (rawHash hash)
  202 
  203 updateAux :: Item -> Int64 -> IO ()
  204 updateAux item aux = xlatePoke64 (iAux item) $ aux
  205 updateTime :: forall a.(Enum a) => Item -> a -> IO ()
  206 updateTime item mtime = updateAux item (fromIntegral $ fromEnum mtime)
  207 
  208 iHash' :: Item -> Hash
  209 iHash' i = SHA256 (iHash i)
  210 
  211 -- | Gives a ForeignPtr to mmapped index, which can be used for reading and
  212 -- updates. The req_size parameter, if non-0, expresses the requested size of
  213 -- the index file. mmapIndex will grow the index if it is smaller than this.
  214 mmapIndex :: forall a. FilePath -> Int -> IO (ForeignPtr a, Int)
  215 mmapIndex indexpath req_size = do
  216   exist <- doesFileExist indexpath
  217   act_size <- fromIntegral `fmap` if exist then fileSize `fmap` getFileStatus indexpath
  218                                            else return 0
  219   let size = case req_size > 0 of
  220         True -> req_size
  221         False | act_size >= size_magic -> act_size - size_magic
  222               | otherwise -> 0
  223   case size of
  224     0 -> return (castForeignPtr nullForeignPtr, size)
  225     _ -> do (x, _, _) <- mmapFileForeignPtr indexpath
  226                                             ReadWriteEx (Just (0, size + size_magic))
  227             return (x, size)
  228 
  229 data IndexM m = Index { mmap :: (ForeignPtr ())
  230                       , basedir :: FilePath
  231                       , hashtree :: Tree m -> Hash
  232                       , predicate :: Relative -> TreeItem m -> Bool }
  233               | EmptyIndex
  234 
  235 type Index = IndexM IO
  236 
  237 data State = State { dirlength :: !Int
  238                    , path :: !Relative
  239                    , start :: !Int }
  240 
  241 data Result = Result { changed :: !Bool
  242                      , next :: !Int
  243                      , treeitem :: !(Maybe (TreeItem IO))
  244                      , resitem :: !Item }
  245 
  246 readItem :: Index -> State -> IO Result
  247 readItem index state = do
  248   item <- peekItem (mmap index) (start state)
  249   res' <- if itemIsDir item
  250               then readDir  index state item
  251               else readFile index state item
  252   return res'
  253 
  254 readDir :: Index -> State -> Item -> IO Result
  255 readDir index state item =
  256     do following <- fromIntegral <$> xlatePeek64 (iAux item)
  257        exists <- fileExists <$> getFileStatusBS (iPath item)
  258        let name it dirlen = (BS.drop (dirlen + 1) $ iDescriptor it) -- FIXME MAGIC
  259            namelength = (BS.length $ iDescriptor item) - (dirlength state)
  260            myname = name item (dirlength state)
  261            substate = state { start = start state + itemNext item
  262                             , path = path state </> myname
  263                             , dirlength = if myname == (BSC.singleton '.')
  264                                              then dirlength state
  265                                              else dirlength state + namelength }
  266 
  267            want = exists && (predicate index) (path substate) (Stub undefined NoHash)
  268            oldhash = iHash' item
  269 
  270            subs off | off < following = do
  271              result <- readItem index $ substate { start = off }
  272              rest <- subs $ next result
  273              return $! (name (resitem result) $ dirlength substate, result) : rest
  274            subs coff | coff == following = return []
  275                      | otherwise = fail $ "Offset mismatch at " ++ show coff ++
  276                                           " (ends at " ++ show following ++ ")"
  277 
  278        inferiors <- if want then subs $ start substate
  279                             else return []
  280 
  281        let we_changed = or [ changed x | (_, x) <- inferiors ] || nullleaf
  282            nullleaf = null inferiors && oldhash == nullsha
  283            nullsha = SHA256 (BS.replicate 32 0)
  284            tree' = makeTree [ (n, fromJust $ treeitem s) | (n, s) <- inferiors, isJust $ treeitem s ]
  285            treehash = if we_changed then hashtree index tree' else oldhash
  286            tree = tree' { treeHash = treehash }
  287 
  288        when we_changed $ updateItem item 0 treehash
  289        return $ Result { changed = not exists || we_changed
  290                        , next = following
  291                        , treeitem = if want then Just $ SubTree tree
  292                                             else Nothing
  293                        , resitem = item }
  294 
  295 readFile :: Index -> State -> Item -> IO Result
  296 readFile index state item =
  297     do st <- getFileStatusBS (iPath item)
  298        mtime <- fromIntegral <$> (xlatePeek64 $ iAux item)
  299        size <- xlatePeek64 $ iSize item
  300        let mtime' = modificationTime st
  301            size' = fromIntegral $ fileSize st
  302            readblob = readSegment (basedir index FP.</> BSC.unpack (iPath item), Nothing)
  303            exists = fileExists st
  304            we_changed = mtime /= mtime' || size /= size'
  305            hash = iHash' item
  306        when we_changed $
  307             do hash' <- sha256 `fmap` readblob
  308                updateItem item size' hash'
  309                updateTime item mtime'
  310        return $ Result { changed = not exists || we_changed
  311                        , next = start state + itemNext item
  312                        , treeitem = if exists then Just $ File $ Blob readblob hash else Nothing
  313                        , resitem = item }
  314 
  315 updateIndex :: Index -> IO (Tree IO)
  316 updateIndex EmptyIndex = return emptyTree
  317 updateIndex index =
  318     do let initial = State { start = size_magic
  319                            , dirlength = 0
  320                            , path = root }
  321        res <- readItem index initial
  322        case treeitem res of
  323          Just (SubTree tree) -> return $ filter (predicate index) tree
  324          _ -> fail "Unexpected failure in updateIndex!"
  325 
  326 -- | Read an index and build up a 'Tree' object from it, referring to current
  327 -- working directory. The initial Index object returned by readIndex is not
  328 -- directly useful. However, you can use 'Tree.filter' on it. Either way, to
  329 -- obtain the actual Tree object, call update.
  330 --
  331 -- The usual use pattern is this:
  332 --
  333 -- > do (idx, update) <- readIndex
  334 -- >    tree <- update =<< filter predicate idx
  335 --
  336 -- The resulting tree will be fully expanded.
  337 readIndex :: FilePath -> (Tree IO -> Hash) -> IO Index
  338 readIndex indexpath ht = do
  339   (mmap_ptr, mmap_size) <- mmapIndex indexpath 0
  340   base <- getCurrentDirectory
  341   return $ if mmap_size == 0 then EmptyIndex
  342                              else Index { mmap = mmap_ptr
  343                                         , basedir = base
  344                                         , hashtree = ht
  345                                         , predicate = \_ _ -> True }
  346 
  347 formatIndex :: ForeignPtr () -> Tree IO -> Tree IO -> IO ()
  348 formatIndex mmap_ptr old reference =
  349     do create (SubTree reference) root size_magic
  350        unsafePokeBS magic (BSC.pack "HSI4")
  351     where magic = fromForeignPtr (castForeignPtr mmap_ptr) 0 4
  352           create (File _) path' off =
  353                do i <- createItem BlobType path' mmap_ptr off
  354                   let flatpath = BSC.unpack $ pathToBS path'
  355                   case find old path' of
  356                     Nothing -> return ()
  357                     -- TODO calling getFileStatus here is both slightly
  358                     -- inefficient and slightly race-prone
  359                     Just ti -> do st <- getFileStatus flatpath
  360                                   let hash = itemHash ti
  361                                       mtime = modificationTime st
  362                                       size = fileSize st
  363                                   updateItem i (fromIntegral size) hash
  364                                   updateTime i mtime
  365                   return $ off + itemNext i
  366           create (SubTree s) path' off =
  367                do i <- createItem TreeType path' mmap_ptr off
  368                   case find old path' of
  369                     Nothing -> return ()
  370                     Just ti | itemHash ti == NoHash -> return ()
  371                             | otherwise -> updateItem i 0 $ itemHash ti
  372                   let subs [] = return $ off + itemNext i
  373                       subs ((name,x):xs) = do
  374                         let path'' = path' </> name
  375                         noff <- subs xs
  376                         create x path'' noff
  377                   lastOff <- subs (listImmediate s)
  378                   xlatePoke64 (iAux i) (fromIntegral lastOff)
  379                   return lastOff
  380           create (Stub _ _) path' _ =
  381                fail $ "Cannot create index from stubbed Tree at " ++ show path'
  382 
  383 -- | Will add and remove files in index to make it match the 'Tree' object
  384 -- given (it is an error for the 'Tree' to contain a file or directory that
  385 -- does not exist in a plain form in current working directory).
  386 updateIndexFrom :: FilePath -> (Tree IO -> Hash) -> Tree IO -> IO Index
  387 updateIndexFrom indexpath hashtree' ref =
  388     do old_idx <- updateIndex =<< readIndex indexpath hashtree'
  389        reference <- expand ref
  390        let len_root = itemAllocSize root
  391            len = len_root + sum [ itemAllocSize p | (p, _) <- list reference ]
  392        exist <- doesFileExist indexpath
  393 #if mingw32_HOST_OS
  394        when exist $ renameFile indexpath (indexpath <.> "old")
  395 #else
  396        when exist $ removeFile indexpath -- to avoid clobbering oldidx
  397 #endif
  398        (mmap_ptr, _) <- mmapIndex indexpath len
  399        formatIndex mmap_ptr old_idx reference
  400        readIndex indexpath hashtree'
  401 
  402 -- | Check that a given file is an index file with a format we can handle. You
  403 -- should remove and re-create the index whenever this is not true.
  404 indexFormatValid :: FilePath -> IO Bool
  405 indexFormatValid path' =
  406     do magic <- mmapFileByteString path' (Just (0, size_magic))
  407        return $ case BSC.unpack magic of
  408                   "HSI4" -> True
  409                   _ -> False
  410     `catch` \(_::SomeException) -> return False
  411 
  412 instance FilterTree IndexM IO where
  413     filter _ EmptyIndex = EmptyIndex
  414     filter p index = index { predicate = \a b -> predicate index a b && p a b }