FileSystemEncoding.hs
1 {- GHC File system encoding handling. 2 - 3 - Copyright 2012-2021 Joey Hess <id@joeyh.name> 4 - 5 - License: BSD-2-clause 6 -} 7 8 {-# LANGUAGE CPP #-} 9 {-# OPTIONS_GHC -fno-warn-tabs #-} 10 11 module Utility.FileSystemEncoding ( 12 useFileSystemEncoding, 13 fileEncoding, 14 RawFilePath, 15 fromRawFilePath, 16 toRawFilePath, 17 decodeBL, 18 encodeBL, 19 decodeBS, 20 encodeBS, 21 truncateFilePath, 22 ) where 23 24 import qualified GHC.IO.Encoding as Encoding 25 import System.IO 26 import qualified Data.ByteString as S 27 import qualified Data.ByteString.Lazy as L 28 #ifdef mingw32_HOST_OS 29 import qualified Data.ByteString.UTF8 as S8 30 import qualified Data.ByteString.Lazy.UTF8 as L8 31 #else 32 import qualified GHC.Foreign as GHC 33 import System.IO.Unsafe 34 import Data.ByteString.Unsafe (unsafePackMallocCStringLen) 35 import Data.Char 36 import Data.List 37 #endif 38 39 -- | A literal file path 40 type RawFilePath = S.ByteString 41 42 {- Makes all subsequent Handles that are opened, as well as stdio Handles, 43 - use the filesystem encoding, instead of the encoding of the current 44 - locale. 45 - 46 - The filesystem encoding allows "arbitrary undecodable bytes to be 47 - round-tripped through it". This avoids encoded failures when data is not 48 - encoded matching the current locale. 49 - 50 - Note that code can still use hSetEncoding to change the encoding of a 51 - Handle. This only affects the default encoding. 52 -} 53 useFileSystemEncoding :: IO () 54 useFileSystemEncoding = do 55 #ifndef mingw32_HOST_OS 56 e <- Encoding.getFileSystemEncoding 57 #else 58 {- The file system encoding does not work well on Windows, 59 - and Windows only has utf FilePaths anyway. -} 60 let e = Encoding.utf8 61 #endif 62 hSetEncoding stdin e 63 hSetEncoding stdout e 64 hSetEncoding stderr e 65 Encoding.setLocaleEncoding e 66 67 fileEncoding :: Handle -> IO () 68 #ifndef mingw32_HOST_OS 69 fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding 70 #else 71 fileEncoding h = hSetEncoding h Encoding.utf8 72 #endif 73 74 {- Decodes a ByteString into a FilePath, applying the filesystem encoding. -} 75 decodeBL :: L.ByteString -> FilePath 76 #ifndef mingw32_HOST_OS 77 decodeBL = decodeBS . L.toStrict 78 #else 79 {- On Windows, we assume that the ByteString is utf-8, since Windows 80 - only uses unicode for filenames. -} 81 decodeBL = L8.toString 82 #endif 83 84 {- Encodes a FilePath into a ByteString, applying the filesystem encoding. -} 85 encodeBL :: FilePath -> L.ByteString 86 #ifndef mingw32_HOST_OS 87 encodeBL = L.fromStrict . encodeBS 88 #else 89 encodeBL = L8.fromString 90 #endif 91 92 decodeBS :: S.ByteString -> FilePath 93 #ifndef mingw32_HOST_OS 94 -- This does the same thing as System.FilePath.ByteString.decodeFilePath, 95 -- with an identical implementation. 96 {-# NOINLINE decodeBS #-} 97 decodeBS b = unsafePerformIO $ do 98 enc <- Encoding.getFileSystemEncoding 99 S.useAsCStringLen b (GHC.peekCStringLen enc) 100 #else 101 decodeBS = S8.toString 102 #endif 103 104 encodeBS :: FilePath -> S.ByteString 105 #ifndef mingw32_HOST_OS 106 -- This does the same thing as System.FilePath.ByteString.encodeFilePath, 107 -- with an identical implementation. 108 {-# NOINLINE encodeBS #-} 109 encodeBS f = unsafePerformIO $ do 110 enc <- Encoding.getFileSystemEncoding 111 GHC.newCStringLen enc f >>= unsafePackMallocCStringLen 112 #else 113 encodeBS = S8.fromString 114 #endif 115 116 fromRawFilePath :: RawFilePath -> FilePath 117 fromRawFilePath = decodeBS 118 119 toRawFilePath :: FilePath -> RawFilePath 120 toRawFilePath = encodeBS 121 122 {- Truncates a path to the given number of bytes (or less), 123 - as represented on disk. 124 - 125 - Avoids returning an invalid part of a unicode byte sequence, at the 126 - cost of efficiency when running on a large FilePath. 127 - 128 - Note that this may return ""! That can happen if it is asked to truncate 129 - to eg 1 byte, but the input path starts with a unicode byte sequence. 130 -} 131 truncateFilePath :: Int -> RawFilePath -> RawFilePath 132 #ifndef mingw32_HOST_OS 133 {- On unix, do not assume a unicode locale, but does assume ascii 134 - characters are a single byte. -} 135 truncateFilePath n b = 136 let blen = S.length b 137 in if blen <= n 138 then b 139 else go blen (reverse (fromRawFilePath b)) 140 where 141 go blen f = case uncons f of 142 Just (c, f') 143 | isAscii c -> 144 let blen' = blen - 1 145 in if blen' <= n 146 then toRawFilePath (reverse f') 147 else go blen' f' 148 | otherwise -> 149 let blen' = S.length (toRawFilePath f') 150 in if blen' <= n 151 then toRawFilePath (reverse f') 152 else go blen' f' 153 Nothing -> toRawFilePath (reverse f) 154 #else 155 {- On Windows, count the number of bytes used by each utf8 character. -} 156 truncateFilePath n = toRawFilePath . reverse . go [] n 157 where 158 go coll cnt bs 159 | cnt <= 0 = coll 160 | otherwise = case S8.decode bs of 161 Just (c, x) 162 | c /= S8.replacement_char -> 163 let x' = fromIntegral x 164 in if cnt - x' < 0 165 then coll 166 else go (c:coll) (cnt - x') (S8.drop 1 bs) 167 | otherwise -> 168 go ('_':coll) (cnt - 1) (S8.drop 1 bs) 169 _ -> coll 170 #endif