/ nix / script / exe / unshare-gpu.hs
unshare-gpu.hs
  1  {-# LANGUAGE OverloadedStrings #-}
  2  
  3  {- |
  4  oci-gpu: Run OCI container images with NVIDIA GPU access
  5  
  6  Uses Aleph.Script - batteries-included shell scripting for Haskell.
  7  Much cleaner than raw Turtle or Shelly!
  8  -}
  9  module Main where
 10  
 11  import Aleph.Script
 12  import qualified Data.Aeson.KeyMap as KM
 13  import qualified Data.ByteString.Lazy as LBS
 14  import Data.Foldable (toList)
 15  import qualified Data.Text as T
 16  import qualified System.Environment as Env
 17  import System.Posix.Process (executeFile)
 18  import Prelude hiding (FilePath)
 19  
 20  -- ============================================================================
 21  -- Types
 22  -- ============================================================================
 23  
 24  data Config = Config
 25      { cfgImage :: Text
 26      , cfgCommand :: [Text]
 27      , cfgPlatform :: Text
 28      , cfgCacheDir :: FilePath
 29      , cfgCertFile :: FilePath
 30      }
 31  
 32  data ContainerEnv = ContainerEnv
 33      { cenvPath :: Maybe Text
 34      , cenvLdLibPath :: Maybe Text
 35      }
 36  
 37  emptyEnv :: ContainerEnv
 38  emptyEnv = ContainerEnv Nothing Nothing
 39  
 40  -- ============================================================================
 41  -- Main
 42  -- ============================================================================
 43  
 44  main :: IO ()
 45  main = script $ do
 46      args <- liftIO Env.getArgs
 47      case args of
 48          [] -> die "Usage: oci-gpu IMAGE [COMMAND...]"
 49          (img : cmdArgs) -> do
 50              let cmd' = if Prelude.null cmdArgs then ["nvidia-smi"] else Prelude.map pack cmdArgs
 51              cfg <- buildConfig (pack img) cmd'
 52              runWithGpu cfg
 53  
 54  buildConfig :: Text -> [Text] -> Sh Config
 55  buildConfig img cmd' = do
 56      homeDir <- getEnvDefault "HOME" "/tmp"
 57      xdgCache <- getEnv "XDG_CACHE_HOME"
 58      let cacheBase = fromText $ fromMaybe (homeDir <> "/.cache") xdgCache
 59  
 60      pure
 61          Config
 62              { cfgImage = img
 63              , cfgCommand = cmd'
 64              , cfgPlatform = "linux/amd64"
 65              , cfgCacheDir = cacheBase </> "straylight-oci"
 66              , cfgCertFile = "/etc/ssl/certs/ca-bundle.crt"
 67              }
 68  
 69  -- ============================================================================
 70  -- Core Logic
 71  -- ============================================================================
 72  
 73  runWithGpu :: Config -> Sh ()
 74  runWithGpu cfg = do
 75      mkdirP (cfgCacheDir cfg)
 76  
 77      cacheKey <- getCacheKey (cfgImage cfg)
 78      let cachedRootfs = cfgCacheDir cfg </> unpack cacheKey
 79  
 80      withTmpDir $ \workDir -> do
 81          let rootfsLink = workDir </> "rootfs"
 82  
 83          cached <- test_d cachedRootfs
 84          if cached
 85              then do
 86                  echoErr $ ":: Using cached " <> cfgImage cfg
 87                  symlink cachedRootfs rootfsLink
 88              else do
 89                  echoErr $ ":: Pulling " <> cfgImage cfg
 90                  pullImage cfg workDir
 91                  mv (workDir </> "rootfs") cachedRootfs
 92                  symlink cachedRootfs rootfsLink
 93                  echoErr $ ":: Cached to " <> pack cachedRootfs
 94  
 95          mkdirP (rootfsLink </> "usr/local/nvidia/bin")
 96          mkdirP (rootfsLink </> "usr/local/nvidia/lib64")
 97  
 98          nvBinds <- withGpuBinds
 99          containerEnv <- getContainerEnv cfg
100  
101          let combinedPath = buildPath containerEnv
102              combinedLdPath = buildLdPath containerEnv
103  
104          echoErr ":: Entering namespace with GPU"
105          let bwrapArgs = buildBwrapArgs workDir nvBinds combinedPath combinedLdPath (cfgCommand cfg)
106  
107          liftIO $ executeFile "bwrap" True (Prelude.map unpack bwrapArgs) Nothing
108  
109  -- ============================================================================
110  -- Image Operations
111  -- ============================================================================
112  
113  getCacheKey :: Text -> Sh Text
114  getCacheKey img = do
115      result <- bash $ "echo -n '" <> img <> "' | sha256sum | cut -c1-16"
116      pure $ strip result
117  
118  pullImage :: Config -> FilePath -> Sh ()
119  pullImage cfg workDir = do
120      let rootfs = workDir </> "rootfs"
121      mkdirP rootfs
122      setEnv "SSL_CERT_FILE" (pack $ cfgCertFile cfg)
123      bash_ $
124          "crane export --platform "
125              <> cfgPlatform cfg
126              <> " '"
127              <> cfgImage cfg
128              <> "' - | tar -xf - -C "
129              <> pack rootfs
130  
131  getContainerEnv :: Config -> Sh ContainerEnv
132  getContainerEnv cfg = do
133      result <- errExit False $ run "crane" ["config", cfgImage cfg]
134      code <- exitCode
135      if code == 0
136          then pure $ parseEnvFromJson result
137          else pure emptyEnv
138    where
139      parseEnvFromJson :: Text -> ContainerEnv
140      parseEnvFromJson json =
141          case decode (LBS.fromStrict $ encodeUtf8 json) :: Maybe Value of
142              Just val -> extractEnv val
143              Nothing -> emptyEnv
144  
145      extractEnv :: Value -> ContainerEnv
146      extractEnv val = fromMaybe emptyEnv $ do
147          Object root <- Just val
148          Object cfgObj <- KM.lookup "config" root
149          Array envArr <- KM.lookup "Env" cfgObj
150          let envPairs = Prelude.map extractEnvVar (toList envArr)
151          Just
152              ContainerEnv
153                  { cenvPath = Prelude.lookup "PATH" envPairs
154                  , cenvLdLibPath = Prelude.lookup "LD_LIBRARY_PATH" envPairs
155                  }
156  
157      extractEnvVar (String str) = let (k, v) = breakOn "=" str in (k, T.drop 1 v)
158      extractEnvVar _ = ("", "")
159  
160  -- ============================================================================
161  -- Environment Building
162  -- ============================================================================
163  
164  buildPath :: ContainerEnv -> Text
165  buildPath env =
166      "/usr/local/nvidia/bin:" <> fromMaybe defaultPath (cenvPath env)
167    where
168      defaultPath = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
169  
170  buildLdPath :: ContainerEnv -> Text
171  buildLdPath env =
172      "/usr/local/nvidia/lib64:/run/opengl-driver/lib"
173          <> maybe "" (":" <>) (cenvLdLibPath env)
174  
175  -- ============================================================================
176  -- Bwrap Execution
177  -- ============================================================================
178  
179  buildBwrapArgs :: FilePath -> [Text] -> Text -> Text -> [Text] -> [Text]
180  buildBwrapArgs workDir nvBinds path ldPath cmd' =
181      [ "--bind"
182      , pack (workDir </> "rootfs")
183      , "/"
184      , "--dev"
185      , "/dev"
186      , "--proc"
187      , "/proc"
188      , "--ro-bind"
189      , "/sys"
190      , "/sys"
191      , "--tmpfs"
192      , "/tmp"
193      , "--tmpfs"
194      , "/run"
195      ]
196          <> nvBinds
197          <> [ "--ro-bind"
198             , "/etc/resolv.conf"
199             , "/etc/resolv.conf"
200             , "--ro-bind"
201             , "/etc/ssl"
202             , "/etc/ssl"
203             , "--setenv"
204             , "PATH"
205             , path
206             , "--setenv"
207             , "HOME"
208             , "/root"
209             , "--setenv"
210             , "LD_LIBRARY_PATH"
211             , ldPath
212             , "--setenv"
213             , "OPAL_PREFIX"
214             , "/opt/hpcx/ompi"
215             , "--setenv"
216             , "OMPI_MCA_btl"
217             , "^openib"
218             , "--chdir"
219             , "/root"
220             , "--die-with-parent"
221             , "--unshare-pid"
222             , "--"
223             ]
224          <> cmd'