/ Utility / FileSystemEncoding.hs
FileSystemEncoding.hs
  1  {- GHC File system encoding handling.
  2   -
  3   - Copyright 2012-2021 Joey Hess <id@joeyh.name>
  4   -
  5   - License: BSD-2-clause
  6   -}
  7  
  8  {-# LANGUAGE CPP #-}
  9  {-# OPTIONS_GHC -fno-warn-tabs #-}
 10  
 11  module Utility.FileSystemEncoding (
 12  	useFileSystemEncoding,
 13  	fileEncoding,
 14  	RawFilePath,
 15  	fromRawFilePath,
 16  	toRawFilePath,
 17  	decodeBL,
 18  	encodeBL,
 19  	decodeBS,
 20  	encodeBS,
 21  	truncateFilePath,
 22  ) where
 23  
 24  import qualified GHC.IO.Encoding as Encoding
 25  import System.IO
 26  import qualified Data.ByteString as S
 27  import qualified Data.ByteString.Lazy as L
 28  #ifdef mingw32_HOST_OS
 29  import qualified Data.ByteString.UTF8 as S8
 30  import qualified Data.ByteString.Lazy.UTF8 as L8
 31  #else
 32  import qualified GHC.Foreign as GHC
 33  import System.IO.Unsafe
 34  import Data.ByteString.Unsafe (unsafePackMallocCStringLen)
 35  import Data.Char
 36  import Data.List
 37  #endif
 38  
 39  -- | A literal file path
 40  type RawFilePath = S.ByteString
 41  
 42  {- Makes all subsequent Handles that are opened, as well as stdio Handles,
 43   - use the filesystem encoding, instead of the encoding of the current
 44   - locale.
 45   -
 46   - The filesystem encoding allows "arbitrary undecodable bytes to be
 47   - round-tripped through it". This avoids encoded failures when data is not
 48   - encoded matching the current locale.
 49   -
 50   - Note that code can still use hSetEncoding to change the encoding of a
 51   - Handle. This only affects the default encoding.
 52   -}
 53  useFileSystemEncoding :: IO ()
 54  useFileSystemEncoding = do
 55  #ifndef mingw32_HOST_OS
 56  	e <- Encoding.getFileSystemEncoding
 57  #else
 58  	{- The file system encoding does not work well on Windows,
 59  	 - and Windows only has utf FilePaths anyway. -}
 60  	let e = Encoding.utf8
 61  #endif
 62  	hSetEncoding stdin e
 63  	hSetEncoding stdout e
 64  	hSetEncoding stderr e
 65  	Encoding.setLocaleEncoding e	
 66  
 67  fileEncoding :: Handle -> IO ()
 68  #ifndef mingw32_HOST_OS
 69  fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding
 70  #else
 71  fileEncoding h = hSetEncoding h Encoding.utf8
 72  #endif
 73  
 74  {- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
 75  decodeBL :: L.ByteString -> FilePath
 76  #ifndef mingw32_HOST_OS
 77  decodeBL = decodeBS . L.toStrict
 78  #else
 79  {- On Windows, we assume that the ByteString is utf-8, since Windows
 80   - only uses unicode for filenames. -}
 81  decodeBL = L8.toString
 82  #endif
 83  
 84  {- Encodes a FilePath into a ByteString, applying the filesystem encoding. -}
 85  encodeBL :: FilePath -> L.ByteString
 86  #ifndef mingw32_HOST_OS
 87  encodeBL = L.fromStrict . encodeBS
 88  #else
 89  encodeBL = L8.fromString
 90  #endif
 91  
 92  decodeBS :: S.ByteString -> FilePath
 93  #ifndef mingw32_HOST_OS
 94  -- This does the same thing as System.FilePath.ByteString.decodeFilePath,
 95  -- with an identical implementation.
 96  {-# NOINLINE decodeBS #-}
 97  decodeBS b = unsafePerformIO $ do
 98  	enc <- Encoding.getFileSystemEncoding
 99  	S.useAsCStringLen b (GHC.peekCStringLen enc)
100  #else
101  decodeBS = S8.toString
102  #endif
103  
104  encodeBS :: FilePath -> S.ByteString
105  #ifndef mingw32_HOST_OS
106  -- This does the same thing as System.FilePath.ByteString.encodeFilePath,
107  -- with an identical implementation.
108  {-# NOINLINE encodeBS #-}
109  encodeBS f = unsafePerformIO $ do
110  	enc <- Encoding.getFileSystemEncoding
111  	GHC.newCStringLen enc f >>= unsafePackMallocCStringLen
112  #else
113  encodeBS = S8.fromString
114  #endif
115  
116  fromRawFilePath :: RawFilePath -> FilePath
117  fromRawFilePath = decodeBS
118  
119  toRawFilePath :: FilePath -> RawFilePath
120  toRawFilePath = encodeBS
121  
122  {- Truncates a path to the given number of bytes (or less),
123   - as represented on disk.
124   -
125   - Avoids returning an invalid part of a unicode byte sequence, at the
126   - cost of efficiency when running on a large FilePath.
127   -
128   - Note that this may return ""! That can happen if it is asked to truncate
129   - to eg 1 byte, but the input path starts with a unicode byte sequence.
130   -}
131  truncateFilePath :: Int -> RawFilePath -> RawFilePath
132  #ifndef mingw32_HOST_OS
133  {- On unix, do not assume a unicode locale, but does assume ascii
134   - characters are a single byte. -}
135  truncateFilePath n b = 
136  	let blen = S.length b
137  	in if blen <= n
138  		then b
139  		else go blen (reverse (fromRawFilePath b))
140    where
141  	go blen f = case uncons f of
142  		Just (c, f')
143  			| isAscii c ->
144  				let blen' = blen - 1
145  				in if blen' <= n
146  					then toRawFilePath (reverse f')
147  					else go blen' f'
148  			| otherwise ->
149  				let blen' = S.length (toRawFilePath f')
150  				in if blen' <= n 
151  					then toRawFilePath (reverse f')
152  					else go blen' f'
153  		Nothing -> toRawFilePath (reverse f)
154  #else
155  {- On Windows, count the number of bytes used by each utf8 character. -}
156  truncateFilePath n = toRawFilePath . reverse . go [] n
157    where
158  	go coll cnt bs
159  		| cnt <= 0 = coll
160  		| otherwise = case S8.decode bs of
161  			Just (c, x)
162  				| c /= S8.replacement_char ->
163  					let x' = fromIntegral x
164  					in if cnt - x' < 0
165  						then coll
166  						else go (c:coll) (cnt - x') (S8.drop 1 bs)
167  				| otherwise ->
168  					go ('_':coll) (cnt - 1) (S8.drop 1 bs)
169  			_ -> coll
170  #endif