spotify · dlstadther · Jul 13, 2017 · Jun 30, 2017 · Jun 30, 2017 · Jul 3, 2017
@@ -34,11 +34,15 @@
     import pickle
 import logging
 import sys
+import os
 
 
 class PySparkRunner(object):
 
     def __init__(self, job, *args):
+        # Append job directory to PYTHON_PATH to enable dynamic import
+        # of the module in which the class resides on unpickling
+        sys.path.append(os.path.dirname(job))
         with open(job, "rb") as fd:
             self.job = pickle.load(fd)
         self.args = args

@@ -22,6 +22,7 @@
 import shutil
 import importlib
 import tarfile
+import inspect
 try:
     import cPickle as pickle
 except ImportError:
@@ -278,6 +279,9 @@ def run(self):
         self.run_path = tempfile.mkdtemp(prefix=self.name)
         self.run_pickle = os.path.join(self.run_path, '.'.join([self.name.replace(' ', '_'), 'pickle']))
         with open(self.run_pickle, 'wb') as fd:
+            # Copy module file to run path.
+            module_path = os.path.abspath(inspect.getfile(self.__class__))
+            shutil.copy(module_path, os.path.join(self.run_path, '.'))
             self._dump(fd)
         try:
             super(PySparkTask, self).run()
@@ -289,7 +293,7 @@ def _dump(self, fd):
             if self.__module__ == '__main__':
                 d = pickle.dumps(self)
                 module_name = os.path.basename(sys.argv[0]).rsplit('.', 1)[0]
-                d = d.replace(b'(c__main__', "(c" + module_name)
+                d = d.replace(b'c__main__', b'c' + module_name.encode('ascii'))
                 fd.write(d)
             else:
                 pickle.dump(self, fd)

@@ -17,6 +17,8 @@
 
 import unittest
 import os
+import sys
+import pickle
 import luigi
 import luigi.contrib.hdfs
 from luigi import six
@@ -219,6 +221,14 @@ def mock_spark_submit(task):
             PySparkRunner(*task.app_command()[1:]).run()
             # Check py-package exists
             self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))
+            # Check that main module containing the task exists.
+            run_path = os.path.dirname(task.app_command()[1])
+            self.assertTrue(os.path.exists(os.path.join(run_path, os.path.basename(__file__))))
+            # Check that the python path contains the run_path
+            self.assertTrue(run_path in sys.path)
+            # Check if find_class finds the class for the correct module name.
+            with open(task.app_command()[1], 'rb') as fp:
+                self.assertTrue(pickle.Unpickler(fp).find_class('spark_test', 'TestPySparkTask'))
 
         with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
             job = TestPySparkTask()