/ test / python / testpipeline / testtext / testtranslation.py
testtranslation.py
  1  """
  2  Translation module tests
  3  """
  4  
  5  import unittest
  6  import time
  7  
  8  import requests
  9  
 10  from txtai.pipeline import Translation
 11  
 12  
 13  class TestTranslation(unittest.TestCase):
 14      """
 15      Translation tests.
 16      """
 17  
 18      @classmethod
 19      def setUpClass(cls):
 20          """
 21          Create single translation instance.
 22          """
 23  
 24          cls.translate = Translation()
 25  
 26          # Preload list of models. Handle HF Hub errors.
 27          complete, wait = False, 1
 28          while not complete:
 29              try:
 30                  cls.translate.lookup("en", "es")
 31                  complete = True
 32              except requests.exceptions.HTTPError:
 33                  # Exponential backoff
 34                  time.sleep(wait)
 35  
 36                  # Wait up to 16 seconds
 37                  wait = min(wait * 2, 16)
 38  
 39      def testDetect(self):
 40          """
 41          Test language detection
 42          """
 43  
 44          test = ["This is a test language detection."]
 45          language = self.translate.detect(test)
 46  
 47          self.assertListEqual(language, ["en"])
 48  
 49      def testDetectWithCustomFunc(self):
 50          """
 51          Test language detection with custom function
 52          """
 53  
 54          def dummy_func(text):
 55              return ["en" for x in text]
 56  
 57          translate = Translation(langdetect=dummy_func)
 58  
 59          test = ["This is a test language detection."]
 60          language = translate.detect(test)
 61  
 62          self.assertListEqual(language, ["en"])
 63  
 64      def testLongTranslation(self):
 65          """
 66          Test a translation longer than max tokenization length
 67          """
 68  
 69          text = "This is a test translation to Spanish. " * 100
 70          translation = self.translate(text, "es")
 71  
 72          # Validate translation text
 73          self.assertIsNotNone(translation)
 74  
 75      def testM2M100Translation(self):
 76          """
 77          Test a translation using M2M100 models
 78          """
 79  
 80          text = self.translate("This is a test translation to Croatian", "hr")
 81  
 82          # Validate translation text
 83          self.assertEqual(text, "Ovo je testni prijevod na hrvatski")
 84  
 85      def testMarianTranslation(self):
 86          """
 87          Test a translation using Marian models
 88          """
 89  
 90          text = "This is a test translation into Spanish"
 91          translation = self.translate(text, "es")
 92  
 93          # Validate translation text
 94          self.assertEqual(translation, "Esta es una traducción de prueba al español")
 95  
 96          # Validate translation back
 97          translation = self.translate(translation, "en")
 98          self.assertEqual(translation, text)
 99  
100      def testNoLang(self):
101          """
102          Test no matching language id
103          """
104  
105          self.assertIsNone(self.translate.langid([], "zz"))
106  
107      def testNoModel(self):
108          """
109          Test no known available model found
110          """
111  
112          self.assertEqual(self.translate.modelpath("zz", "en"), "Helsinki-NLP/opus-mt-mul-en")
113  
114      def testNoTranslation(self):
115          """
116          Test translation skipped when text already in destination language
117          """
118  
119          text = "This is a test translation to English"
120          translation = self.translate(text, "en")
121  
122          # Validate no translation
123          self.assertEqual(text, translation)
124  
125      def testShowmodelsChunked(self):
126          """
127          Test a long translation with showmodels flag. When text is chunked
128          by the tokenizer, results should still be properly concatenated as
129          a 3-tuple (translation, language, model) rather than a malformed tuple.
130          """
131  
132          text = "This is a test translation to Spanish. " * 100
133          result = self.translate(text, "es", showmodels=True)
134  
135          # Result should be a tuple of exactly 3 elements
136          self.assertIsInstance(result, tuple)
137          self.assertEqual(len(result), 3)
138  
139          translation, language, modelpath = result
140  
141          # Translation should be a single string, not a nested tuple
142          self.assertIsInstance(translation, str)
143          self.assertIsNotNone(translation)
144          self.assertGreater(len(translation), 0)
145  
146          # Language and model should be valid strings
147          self.assertEqual(language, "en")
148          self.assertIsInstance(modelpath, str)
149  
150      def testTranslationWithShowmodels(self):
151          """
152          Test a translation using Marian models and showmodels flag to return
153          model and language.
154          """
155  
156          text = "This is a test translation into Spanish"
157          result = self.translate(text, "es", showmodels=True)
158  
159          translation, language, modelpath = result
160          # Validate translation text
161          self.assertEqual(translation, "Esta es una traducción de prueba al español")
162          # Validate detected language
163          self.assertEqual(language, "en")
164          # Validate model
165          self.assertEqual(modelpath, "Helsinki-NLP/opus-mt-en-es")
166  
167          # Validate translation back
168          result = self.translate(translation, "en", showmodels=True)
169  
170          translation, language, modelpath = result
171          self.assertEqual(translation, text)
172          # Validate detected language
173          self.assertEqual(language, "es")
174          # Validate model
175          self.assertEqual(modelpath, "Helsinki-NLP/opus-mt-es-en")