unshare-gpu.hs
1 {-# LANGUAGE OverloadedStrings #-} 2 3 {- | 4 oci-gpu: Run OCI container images with NVIDIA GPU access 5 6 Uses Aleph.Script - batteries-included shell scripting for Haskell. 7 Much cleaner than raw Turtle or Shelly! 8 -} 9 module Main where 10 11 import Aleph.Script 12 import qualified Data.Aeson.KeyMap as KM 13 import qualified Data.ByteString.Lazy as LBS 14 import Data.Foldable (toList) 15 import qualified Data.Text as T 16 import qualified System.Environment as Env 17 import System.Posix.Process (executeFile) 18 import Prelude hiding (FilePath) 19 20 -- ============================================================================ 21 -- Types 22 -- ============================================================================ 23 24 data Config = Config 25 { cfgImage :: Text 26 , cfgCommand :: [Text] 27 , cfgPlatform :: Text 28 , cfgCacheDir :: FilePath 29 , cfgCertFile :: FilePath 30 } 31 32 data ContainerEnv = ContainerEnv 33 { cenvPath :: Maybe Text 34 , cenvLdLibPath :: Maybe Text 35 } 36 37 emptyEnv :: ContainerEnv 38 emptyEnv = ContainerEnv Nothing Nothing 39 40 -- ============================================================================ 41 -- Main 42 -- ============================================================================ 43 44 main :: IO () 45 main = script $ do 46 args <- liftIO Env.getArgs 47 case args of 48 [] -> die "Usage: oci-gpu IMAGE [COMMAND...]" 49 (img : cmdArgs) -> do 50 let cmd' = if Prelude.null cmdArgs then ["nvidia-smi"] else Prelude.map pack cmdArgs 51 cfg <- buildConfig (pack img) cmd' 52 runWithGpu cfg 53 54 buildConfig :: Text -> [Text] -> Sh Config 55 buildConfig img cmd' = do 56 homeDir <- getEnvDefault "HOME" "/tmp" 57 xdgCache <- getEnv "XDG_CACHE_HOME" 58 let cacheBase = fromText $ fromMaybe (homeDir <> "/.cache") xdgCache 59 60 pure 61 Config 62 { cfgImage = img 63 , cfgCommand = cmd' 64 , cfgPlatform = "linux/amd64" 65 , cfgCacheDir = cacheBase </> "straylight-oci" 66 , cfgCertFile = "/etc/ssl/certs/ca-bundle.crt" 67 } 68 69 -- ============================================================================ 70 -- Core Logic 71 -- ============================================================================ 72 73 runWithGpu :: Config -> Sh () 74 runWithGpu cfg = do 75 mkdirP (cfgCacheDir cfg) 76 77 cacheKey <- getCacheKey (cfgImage cfg) 78 let cachedRootfs = cfgCacheDir cfg </> unpack cacheKey 79 80 withTmpDir $ \workDir -> do 81 let rootfsLink = workDir </> "rootfs" 82 83 cached <- test_d cachedRootfs 84 if cached 85 then do 86 echoErr $ ":: Using cached " <> cfgImage cfg 87 symlink cachedRootfs rootfsLink 88 else do 89 echoErr $ ":: Pulling " <> cfgImage cfg 90 pullImage cfg workDir 91 mv (workDir </> "rootfs") cachedRootfs 92 symlink cachedRootfs rootfsLink 93 echoErr $ ":: Cached to " <> pack cachedRootfs 94 95 mkdirP (rootfsLink </> "usr/local/nvidia/bin") 96 mkdirP (rootfsLink </> "usr/local/nvidia/lib64") 97 98 nvBinds <- withGpuBinds 99 containerEnv <- getContainerEnv cfg 100 101 let combinedPath = buildPath containerEnv 102 combinedLdPath = buildLdPath containerEnv 103 104 echoErr ":: Entering namespace with GPU" 105 let bwrapArgs = buildBwrapArgs workDir nvBinds combinedPath combinedLdPath (cfgCommand cfg) 106 107 liftIO $ executeFile "bwrap" True (Prelude.map unpack bwrapArgs) Nothing 108 109 -- ============================================================================ 110 -- Image Operations 111 -- ============================================================================ 112 113 getCacheKey :: Text -> Sh Text 114 getCacheKey img = do 115 result <- bash $ "echo -n '" <> img <> "' | sha256sum | cut -c1-16" 116 pure $ strip result 117 118 pullImage :: Config -> FilePath -> Sh () 119 pullImage cfg workDir = do 120 let rootfs = workDir </> "rootfs" 121 mkdirP rootfs 122 setEnv "SSL_CERT_FILE" (pack $ cfgCertFile cfg) 123 bash_ $ 124 "crane export --platform " 125 <> cfgPlatform cfg 126 <> " '" 127 <> cfgImage cfg 128 <> "' - | tar -xf - -C " 129 <> pack rootfs 130 131 getContainerEnv :: Config -> Sh ContainerEnv 132 getContainerEnv cfg = do 133 result <- errExit False $ run "crane" ["config", cfgImage cfg] 134 code <- exitCode 135 if code == 0 136 then pure $ parseEnvFromJson result 137 else pure emptyEnv 138 where 139 parseEnvFromJson :: Text -> ContainerEnv 140 parseEnvFromJson json = 141 case decode (LBS.fromStrict $ encodeUtf8 json) :: Maybe Value of 142 Just val -> extractEnv val 143 Nothing -> emptyEnv 144 145 extractEnv :: Value -> ContainerEnv 146 extractEnv val = fromMaybe emptyEnv $ do 147 Object root <- Just val 148 Object cfgObj <- KM.lookup "config" root 149 Array envArr <- KM.lookup "Env" cfgObj 150 let envPairs = Prelude.map extractEnvVar (toList envArr) 151 Just 152 ContainerEnv 153 { cenvPath = Prelude.lookup "PATH" envPairs 154 , cenvLdLibPath = Prelude.lookup "LD_LIBRARY_PATH" envPairs 155 } 156 157 extractEnvVar (String str) = let (k, v) = breakOn "=" str in (k, T.drop 1 v) 158 extractEnvVar _ = ("", "") 159 160 -- ============================================================================ 161 -- Environment Building 162 -- ============================================================================ 163 164 buildPath :: ContainerEnv -> Text 165 buildPath env = 166 "/usr/local/nvidia/bin:" <> fromMaybe defaultPath (cenvPath env) 167 where 168 defaultPath = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" 169 170 buildLdPath :: ContainerEnv -> Text 171 buildLdPath env = 172 "/usr/local/nvidia/lib64:/run/opengl-driver/lib" 173 <> maybe "" (":" <>) (cenvLdLibPath env) 174 175 -- ============================================================================ 176 -- Bwrap Execution 177 -- ============================================================================ 178 179 buildBwrapArgs :: FilePath -> [Text] -> Text -> Text -> [Text] -> [Text] 180 buildBwrapArgs workDir nvBinds path ldPath cmd' = 181 [ "--bind" 182 , pack (workDir </> "rootfs") 183 , "/" 184 , "--dev" 185 , "/dev" 186 , "--proc" 187 , "/proc" 188 , "--ro-bind" 189 , "/sys" 190 , "/sys" 191 , "--tmpfs" 192 , "/tmp" 193 , "--tmpfs" 194 , "/run" 195 ] 196 <> nvBinds 197 <> [ "--ro-bind" 198 , "/etc/resolv.conf" 199 , "/etc/resolv.conf" 200 , "--ro-bind" 201 , "/etc/ssl" 202 , "/etc/ssl" 203 , "--setenv" 204 , "PATH" 205 , path 206 , "--setenv" 207 , "HOME" 208 , "/root" 209 , "--setenv" 210 , "LD_LIBRARY_PATH" 211 , ldPath 212 , "--setenv" 213 , "OPAL_PREFIX" 214 , "/opt/hpcx/ompi" 215 , "--setenv" 216 , "OMPI_MCA_btl" 217 , "^openib" 218 , "--chdir" 219 , "/root" 220 , "--die-with-parent" 221 , "--unshare-pid" 222 , "--" 223 ] 224 <> cmd'