1 {-# LANGUAGE CPP, ScopedTypeVariables, MultiParamTypeClasses #-} 2 3 -- | This module contains plain tree indexing code. The index itself is a 4 -- CACHE: you should only ever use it as an optimisation and never as a primary 5 -- storage. In practice, this means that when we change index format, the 6 -- application is expected to throw the old index away and build a fresh 7 -- index. Please note that tracking index validity is out of scope for this 8 -- library: this is responsibility of your application. It is advisable that in 9 -- your validity tracking code, you also check for format validity (see 10 -- 'indexFormatValid') and scrap and re-create index when needed. 11 -- 12 -- The index is a binary file that overlays a hashed tree over the working 13 -- copy. This means that every working file and directory has an entry in the 14 -- index, that contains its path and hash and validity data. The validity data 15 -- is a timestamp plus the file size. The file hashes are sha256's of the 16 -- file's content. 17 -- 18 -- There are two entry types, a file entry and a directory entry. Both have a 19 -- common binary format (see 'Item'). The on-disk format is best described by 20 -- the section /Index format/ below. 21 -- 22 -- For each file, the index has a copy of the file's last modification 23 -- timestamp taken at the instant when the hash has been computed. This means 24 -- that when file size and timestamp of a file in working copy matches those in 25 -- the index, we assume that the hash stored in the index for given file is 26 -- valid. These hashes are then exposed in the resulting 'Tree' object, and can 27 -- be leveraged by eg. 'diffTrees' to compare many files quickly. 28 -- 29 -- You may have noticed that we also keep hashes of directories. These are 30 -- assumed to be valid whenever the complete subtree has been valid. At any 31 -- point, as soon as a size or timestamp mismatch is found, the working file in 32 -- question is opened, its hash (and timestamp and size) is recomputed and 33 -- updated in-place in the index file (everything lives at a fixed offset and 34 -- is fixed size, so this isn't an issue). This is also true of directories: 35 -- when a file in a directory changes hash, this triggers recomputation of all 36 -- of its parent directory hashes; moreover this is done efficiently -- each 37 -- directory is updated at most once during an update run. 38 -- 39 -- /Index format/ 40 -- 41 -- The Index is organised into \"lines\" where each line describes a single 42 -- indexed item. Cf. 'Item'. 43 -- 44 -- The first word on the index \"line\" is the length of the file path (which is 45 -- the only variable-length part of the line). Then comes the path itself, then 46 -- fixed-length hash (sha256) of the file in question, then two words, one for 47 -- size and one "aux", which is used differently for directories and for files. 48 -- 49 -- With directories, this aux holds the offset of the next sibling line in the 50 -- index, so we can efficiently skip reading the whole subtree starting at a 51 -- given directory (by just seeking aux bytes forward). The lines are 52 -- pre-ordered with respect to directory structure -- the directory comes first 53 -- and after it come all its items. Cf. 'readIndex''. 54 -- 55 -- For files, the aux field holds a timestamp. 56 57 module Storage.Hashed.Index( readIndex, updateIndexFrom, indexFormatValid 58 , updateIndex , Index, filter ) 59 where 60 61 import Prelude hiding ( lookup, readFile, writeFile, catch, filter ) 62 import Storage.Hashed.Utils 63 import Storage.Hashed.Tree 64 import Storage.Hashed.Path 65 import Data.Int( Int64, Int32 ) 66 67 import Bundled.Posix( getFileStatusBS, modificationTime, 68 getFileStatus, fileSize, fileExists ) 69 import System.IO.MMap( mmapFileForeignPtr, mmapFileByteString, Mode(..) ) 70 import System.IO( ) 71 import System.Directory( doesFileExist, getCurrentDirectory ) 72 #if mingw32_HOST_OS 73 import System.Directory( renameFile ) 74 import System.FilePath( (<.>) ) 75 #else 76 import System.Directory( removeFile ) 77 #endif 78 import qualified System.FilePath as FP( (</>) ) 79 80 import Control.Monad( when ) 81 import Control.Exception.Extensible 82 import Control.Applicative( (<$>) ) 83 84 import qualified Data.ByteString.Lazy as BL 85 import qualified Data.ByteString as BS 86 import qualified Data.ByteString.Char8 as BSC 87 import Data.ByteString.Unsafe( unsafeHead, unsafeDrop ) 88 import Data.ByteString.Internal( toForeignPtr, fromForeignPtr, memcpy 89 , nullForeignPtr, c2w ) 90 91 import Data.IORef( ) 92 import Data.Maybe( fromJust, isJust ) 93 import Data.Bits( Bits ) 94 95 import Foreign.Storable 96 import Foreign.ForeignPtr 97 import Foreign.Ptr 98 99 import Storage.Hashed.Hash( sha256, rawHash ) 100 101 -------------------------- 102 -- Indexed trees 103 -- 104 105 -- | Description of a a single indexed item. The structure itself does not 106 -- contain any data, just pointers to the underlying mmap (bytestring is a 107 -- pointer + offset + length). 108 -- 109 -- The structure is recursive-ish (as opposed to flat-ish structure, which is 110 -- used by git...) It turns out that it's hard to efficiently read a flat index 111 -- with our internal data structures -- we need to turn the flat index into a 112 -- recursive Tree object, which is rather expensive... As a bonus, we can also 113 -- efficiently implement subtree queries this way (cf. 'readIndex'). 114 data Item = Item { iBase :: !(Ptr ()) 115 , iHashAndDescriptor :: !BS.ByteString 116 } deriving Show 117 118 size_magic :: Int 119 size_magic = 4 -- the magic word, first 4 bytes of the index 120 121 size_dsclen, size_hash, size_size, size_aux :: Int 122 size_size = 8 -- file/directory size (Int64) 123 size_aux = 8 -- aux (Int64) 124 size_dsclen = 4 -- this many bytes store the length of the path 125 size_hash = 32 -- hash representation 126 127 off_size, off_aux, off_hash, off_dsc, off_dsclen :: Int 128 off_size = 0 129 off_aux = off_size + size_size 130 off_dsclen = off_aux + size_aux 131 off_hash = off_dsclen + size_dsclen 132 off_dsc = off_hash + size_hash 133 134 itemAllocSize :: Relative -> Int 135 itemAllocSize apath = 136 align 4 $ size_hash + size_size + size_aux + size_dsclen + 2 + BS.length (pathToBS apath) 137 138 itemSize, itemNext :: Item -> Int 139 itemSize i = size_size + size_aux + size_dsclen + (BS.length $ iHashAndDescriptor i) 140 itemNext i = align 4 (itemSize i + 1) 141 142 iPath, iHash, iDescriptor :: Item -> BS.ByteString 143 iDescriptor = unsafeDrop size_hash . iHashAndDescriptor 144 iPath = unsafeDrop 1 . iDescriptor 145 iHash = BS.take size_hash . iHashAndDescriptor 146 147 iSize, iAux :: Item -> Ptr Int64 148 iSize i = plusPtr (iBase i) off_size 149 iAux i = plusPtr (iBase i) off_aux 150 151 itemIsDir :: Item -> Bool 152 itemIsDir i = unsafeHead (iDescriptor i) == c2w 'D' 153 154 -- xlatePeek32 = fmap xlate32 . peek 155 xlatePeek64 :: (Storable a, Bits a) => Ptr a -> IO a 156 xlatePeek64 = fmap xlate64 . peek 157 158 -- xlatePoke32 ptr v = poke ptr (xlate32 v) 159 xlatePoke64 :: (Storable a, Bits a) => Ptr a -> a -> IO () 160 xlatePoke64 ptr v = poke ptr (xlate64 v) 161 162 -- | Lay out the basic index item structure in memory. The memory location is 163 -- given by a ForeignPointer () and an offset. The path and type given are 164 -- written out, and a corresponding Item is given back. The remaining bits of 165 -- the item can be filled out using 'update'. 166 createItem :: ItemType -> Relative -> ForeignPtr () -> Int -> IO Item 167 createItem typ apath fp off = 168 do let dsc = BS.concat [ BSC.singleton $ if typ == TreeType then 'D' else 'F' 169 , pathToBS apath 170 , BS.singleton 0 ] 171 (dsc_fp, dsc_start, dsc_len) = toForeignPtr dsc 172 withForeignPtr fp $ \p -> 173 withForeignPtr dsc_fp $ \dsc_p -> 174 do pokeByteOff p (off + off_dsclen) (xlate32 $ fromIntegral dsc_len :: Int32) 175 memcpy (plusPtr p $ off + off_dsc) 176 (plusPtr dsc_p dsc_start) 177 (fromIntegral dsc_len) 178 peekItem fp off 179 180 -- | Read the on-disk representation into internal data structure. 181 -- 182 -- See the module-level section /Index format/ for details on how the index 183 -- is structured. 184 peekItem :: ForeignPtr () -> Int -> IO Item 185 peekItem fp off = 186 withForeignPtr fp $ \p -> do 187 nl' :: Int32 <- xlate32 `fmap` peekByteOff p (off + off_dsclen) 188 when (nl' <= 2) $ fail "Descriptor too short in peekItem!" 189 let nl = fromIntegral nl' 190 dsc = fromForeignPtr (castForeignPtr fp) (off + off_hash) (size_hash + nl - 1) 191 return $! Item { iBase = plusPtr p off 192 , iHashAndDescriptor = dsc } 193 194 -- | Update an existing item with new hash and optionally mtime (give Nothing 195 -- when updating directory entries). 196 updateItem :: Item -> Int64 -> Hash -> IO () 197 updateItem item _ NoHash = 198 fail $ "Index.update NoHash: " ++ BSC.unpack (iPath item) 199 updateItem item size hash = 200 do xlatePoke64 (iSize item) size 201 unsafePokeBS (iHash item) (rawHash hash) 202 203 updateAux :: Item -> Int64 -> IO () 204 updateAux item aux = xlatePoke64 (iAux item) $ aux 205 updateTime :: forall a.(Enum a) => Item -> a -> IO () 206 updateTime item mtime = updateAux item (fromIntegral $ fromEnum mtime) 207 208 iHash' :: Item -> Hash 209 iHash' i = SHA256 (iHash i) 210 211 -- | Gives a ForeignPtr to mmapped index, which can be used for reading and 212 -- updates. The req_size parameter, if non-0, expresses the requested size of 213 -- the index file. mmapIndex will grow the index if it is smaller than this. 214 mmapIndex :: forall a. FilePath -> Int -> IO (ForeignPtr a, Int) 215 mmapIndex indexpath req_size = do 216 exist <- doesFileExist indexpath 217 act_size <- fromIntegral `fmap` if exist then fileSize `fmap` getFileStatus indexpath 218 else return 0 219 let size = case req_size > 0 of 220 True -> req_size 221 False | act_size >= size_magic -> act_size - size_magic 222 | otherwise -> 0 223 case size of 224 0 -> return (castForeignPtr nullForeignPtr, size) 225 _ -> do (x, _, _) <- mmapFileForeignPtr indexpath 226 ReadWriteEx (Just (0, size + size_magic)) 227 return (x, size) 228 229 data IndexM m = Index { mmap :: (ForeignPtr ()) 230 , basedir :: FilePath 231 , hashtree :: Tree m -> Hash 232 , predicate :: Relative -> TreeItem m -> Bool } 233 | EmptyIndex 234 235 type Index = IndexM IO 236 237 data State = State { dirlength :: !Int 238 , path :: !Relative 239 , start :: !Int } 240 241 data Result = Result { changed :: !Bool 242 , next :: !Int 243 , treeitem :: !(Maybe (TreeItem IO)) 244 , resitem :: !Item } 245 246 readItem :: Index -> State -> IO Result 247 readItem index state = do 248 item <- peekItem (mmap index) (start state) 249 res' <- if itemIsDir item 250 then readDir index state item 251 else readFile index state item 252 return res' 253 254 readDir :: Index -> State -> Item -> IO Result 255 readDir index state item = 256 do following <- fromIntegral <$> xlatePeek64 (iAux item) 257 exists <- fileExists <$> getFileStatusBS (iPath item) 258 let name it dirlen = (BS.drop (dirlen + 1) $ iDescriptor it) -- FIXME MAGIC 259 namelength = (BS.length $ iDescriptor item) - (dirlength state) 260 myname = name item (dirlength state) 261 substate = state { start = start state + itemNext item 262 , path = path state </> myname 263 , dirlength = if myname == (BSC.singleton '.') 264 then dirlength state 265 else dirlength state + namelength } 266 267 want = exists && (predicate index) (path substate) (Stub undefined NoHash) 268 oldhash = iHash' item 269 270 subs off | off < following = do 271 result <- readItem index $ substate { start = off } 272 rest <- subs $ next result 273 return $! (name (resitem result) $ dirlength substate, result) : rest 274 subs coff | coff == following = return [] 275 | otherwise = fail $ "Offset mismatch at " ++ show coff ++ 276 " (ends at " ++ show following ++ ")" 277 278 inferiors <- if want then subs $ start substate 279 else return [] 280 281 let we_changed = or [ changed x | (_, x) <- inferiors ] || nullleaf 282 nullleaf = null inferiors && oldhash == nullsha 283 nullsha = SHA256 (BS.replicate 32 0) 284 tree' = makeTree [ (n, fromJust $ treeitem s) | (n, s) <- inferiors, isJust $ treeitem s ] 285 treehash = if we_changed then hashtree index tree' else oldhash 286 tree = tree' { treeHash = treehash } 287 288 when we_changed $ updateItem item 0 treehash 289 return $ Result { changed = not exists || we_changed 290 , next = following 291 , treeitem = if want then Just $ SubTree tree 292 else Nothing 293 , resitem = item } 294 295 readFile :: Index -> State -> Item -> IO Result 296 readFile index state item = 297 do st <- getFileStatusBS (iPath item) 298 mtime <- fromIntegral <$> (xlatePeek64 $ iAux item) 299 size <- xlatePeek64 $ iSize item 300 let mtime' = modificationTime st 301 size' = fromIntegral $ fileSize st 302 readblob = readSegment (basedir index FP.</> BSC.unpack (iPath item), Nothing) 303 exists = fileExists st 304 we_changed = mtime /= mtime' || size /= size' 305 hash = iHash' item 306 when we_changed $ 307 do hash' <- sha256 `fmap` readblob 308 updateItem item size' hash' 309 updateTime item mtime' 310 return $ Result { changed = not exists || we_changed 311 , next = start state + itemNext item 312 , treeitem = if exists then Just $ File $ Blob readblob hash else Nothing 313 , resitem = item } 314 315 updateIndex :: Index -> IO (Tree IO) 316 updateIndex EmptyIndex = return emptyTree 317 updateIndex index = 318 do let initial = State { start = size_magic 319 , dirlength = 0 320 , path = root } 321 res <- readItem index initial 322 case treeitem res of 323 Just (SubTree tree) -> return $ filter (predicate index) tree 324 _ -> fail "Unexpected failure in updateIndex!" 325 326 -- | Read an index and build up a 'Tree' object from it, referring to current 327 -- working directory. The initial Index object returned by readIndex is not 328 -- directly useful. However, you can use 'Tree.filter' on it. Either way, to 329 -- obtain the actual Tree object, call update. 330 -- 331 -- The usual use pattern is this: 332 -- 333 -- > do (idx, update) <- readIndex 334 -- > tree <- update =<< filter predicate idx 335 -- 336 -- The resulting tree will be fully expanded. 337 readIndex :: FilePath -> (Tree IO -> Hash) -> IO Index 338 readIndex indexpath ht = do 339 (mmap_ptr, mmap_size) <- mmapIndex indexpath 0 340 base <- getCurrentDirectory 341 return $ if mmap_size == 0 then EmptyIndex 342 else Index { mmap = mmap_ptr 343 , basedir = base 344 , hashtree = ht 345 , predicate = \_ _ -> True } 346 347 formatIndex :: ForeignPtr () -> Tree IO -> Tree IO -> IO () 348 formatIndex mmap_ptr old reference = 349 do create (SubTree reference) root size_magic 350 unsafePokeBS magic (BSC.pack "HSI4") 351 where magic = fromForeignPtr (castForeignPtr mmap_ptr) 0 4 352 create (File _) path' off = 353 do i <- createItem BlobType path' mmap_ptr off 354 let flatpath = BSC.unpack $ pathToBS path' 355 case find old path' of 356 Nothing -> return () 357 -- TODO calling getFileStatus here is both slightly 358 -- inefficient and slightly race-prone 359 Just ti -> do st <- getFileStatus flatpath 360 let hash = itemHash ti 361 mtime = modificationTime st 362 size = fileSize st 363 updateItem i (fromIntegral size) hash 364 updateTime i mtime 365 return $ off + itemNext i 366 create (SubTree s) path' off = 367 do i <- createItem TreeType path' mmap_ptr off 368 case find old path' of 369 Nothing -> return () 370 Just ti | itemHash ti == NoHash -> return () 371 | otherwise -> updateItem i 0 $ itemHash ti 372 let subs [] = return $ off + itemNext i 373 subs ((name,x):xs) = do 374 let path'' = path' </> name 375 noff <- subs xs 376 create x path'' noff 377 lastOff <- subs (listImmediate s) 378 xlatePoke64 (iAux i) (fromIntegral lastOff) 379 return lastOff 380 create (Stub _ _) path' _ = 381 fail $ "Cannot create index from stubbed Tree at " ++ show path' 382 383 -- | Will add and remove files in index to make it match the 'Tree' object 384 -- given (it is an error for the 'Tree' to contain a file or directory that 385 -- does not exist in a plain form in current working directory). 386 updateIndexFrom :: FilePath -> (Tree IO -> Hash) -> Tree IO -> IO Index 387 updateIndexFrom indexpath hashtree' ref = 388 do old_idx <- updateIndex =<< readIndex indexpath hashtree' 389 reference <- expand ref 390 let len_root = itemAllocSize root 391 len = len_root + sum [ itemAllocSize p | (p, _) <- list reference ] 392 exist <- doesFileExist indexpath 393 #if mingw32_HOST_OS 394 when exist $ renameFile indexpath (indexpath <.> "old") 395 #else 396 when exist $ removeFile indexpath -- to avoid clobbering oldidx 397 #endif 398 (mmap_ptr, _) <- mmapIndex indexpath len 399 formatIndex mmap_ptr old_idx reference 400 readIndex indexpath hashtree' 401 402 -- | Check that a given file is an index file with a format we can handle. You 403 -- should remove and re-create the index whenever this is not true. 404 indexFormatValid :: FilePath -> IO Bool 405 indexFormatValid path' = 406 do magic <- mmapFileByteString path' (Just (0, size_magic)) 407 return $ case BSC.unpack magic of 408 "HSI4" -> True 409 _ -> False 410 `catch` \(_::SomeException) -> return False 411 412 instance FilterTree IndexM IO where 413 filter _ EmptyIndex = EmptyIndex 414 filter p index = index { predicate = \a b -> predicate index a b && p a b }