Skip to content

Commit

Permalink
Implement ar-file parsing in python (#8681)
Browse files Browse the repository at this point in the history
This takes a different approach to the problem of duplicate files
in ar archives.

This makes the emar wrapper unnecessary, solving the problem at ar-file
extraction time instead.
  • Loading branch information
sbc100 authored Jun 3, 2019
1 parent ee231b0 commit 1d198e4
Show file tree
Hide file tree
Showing 5 changed files with 224 additions and 180 deletions.
89 changes: 5 additions & 84 deletions emar.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,98 +6,19 @@

"""Archive helper script
This script acts as a frontend replacement for `ar`. See emcc.
This is needed because, unlike a traditional linker, emscripten can't handle
archive with duplicate member names. This is because emscripten extracts
archive to a temporary location and duplicate filenames will clobber each
other in this case.
This script is a simple wrapper around llvm-ar. It used to have special
handling for duplicate basenames in order to allow the bitcode linking process
to read such files. This is now handled by using tools/arfile.py to read archives.
"""

# TODO(sbc): Implement `ar x` within emscripten, in python, to avoid this issue
# and delete this file.

from __future__ import print_function
import hashlib
import os
import shutil
import sys

from tools.toolchain_profiler import ToolchainProfiler
from tools import shared
from tools.response_file import substitute_response_files, create_response_file

if __name__ == '__main__':
ToolchainProfiler.record_process_start()


#
# Main run() function
#
def run():
args = substitute_response_files(sys.argv)
newargs = [shared.LLVM_AR] + args[1:]

to_delete = []

# The 3-argument form of ar doesn't involve other files. For example
# 'ar x libfoo.a'.
if len(newargs) > 3:
cmd = newargs[1]
if 'r' in cmd:
# We are adding files to the archive.
# Normally the output file is then arg 2, except in the case were the
# a or b modifiers are used in which case its arg 3.
if 'a' in cmd or 'b' in cmd:
out_arg_index = 3
else:
out_arg_index = 2

contents = set()
if os.path.exists(newargs[out_arg_index]):
cmd = [shared.LLVM_AR, 't', newargs[out_arg_index]]
output = shared.check_call(cmd, stdout=shared.PIPE).stdout
contents.update(output.split('\n'))

# Add a hash to colliding basename, to make them unique.
for j in range(out_arg_index + 1, len(newargs)):
orig_name = newargs[j]
full_name = os.path.abspath(orig_name)
dirname = os.path.dirname(full_name)
basename = os.path.basename(full_name)
if basename not in contents:
contents.add(basename)
continue
h = hashlib.md5(full_name.encode('utf-8')).hexdigest()[:8]
parts = basename.split('.')
parts[0] += '_' + h
newname = '.'.join(parts)
full_newname = os.path.join(dirname, newname)
assert not os.path.exists(full_newname)
try:
shutil.copyfile(orig_name, full_newname)
newargs[j] = full_newname
to_delete.append(full_newname)
contents.add(newname)
except:
# it is ok to fail here, we just don't get hashing
contents.add(basename)
pass

if shared.DEBUG:
print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)

response_filename = create_response_file(newargs[3:], shared.get_emscripten_temp_dir())
to_delete += [response_filename]
newargs = newargs[:3] + ['@' + response_filename]

if shared.DEBUG:
print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)

try:
return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
finally:
for d in to_delete:
shared.try_delete(d)
newargs = [shared.LLVM_AR] + sys.argv[1:]
return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode


if __name__ == '__main__':
Expand Down
14 changes: 6 additions & 8 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import hashlib
import json
import os
import random
import re
import shutil
import sys
Expand Down Expand Up @@ -5164,26 +5163,25 @@ def test_iostream_and_determinism(self):
return 0;
}
'''
num = 5
num = 3

def test():
print('(iteration)')
time.sleep(random.random() / (10 * num)) # add some timing nondeterminism here, not that we need it, but whatever
time.sleep(1.0)
self.do_run(src, 'hello world\n77.\n')
ret = open('src.cpp.o.js', 'rb').read()
if self.get_setting('WASM') and not self.get_setting('WASM2JS'):
ret += open('src.cpp.o.wasm', 'rb').read()
return ret

builds = [test() for i in range(num)]
print(list(map(len, builds)))
print([len(b) for b in builds])
uniques = set(builds)
if len(uniques) != 1:
i = 0
for unique in uniques:
for i, unique in enumerate(uniques):
open('unique_' + str(i) + '.js', 'wb').write(unique)
i += 1
assert 0, 'builds must be deterministic, see unique_X.js'
# builds must be deterministic, see unique_N.js
self.assertEqual(len(uniques), 1)

def test_stdvec(self):
self.do_run_in_out_file_test('tests', 'core', 'test_stdvec')
Expand Down
37 changes: 5 additions & 32 deletions tests/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -1450,16 +1450,10 @@ def test_archive_duplicate_basenames(self):
''')
run_process([PYTHON, EMCC, os.path.join('b', 'common.c'), '-c', '-o', os.path.join('b', 'common.o')])

try_delete('liba.a')
run_process([PYTHON, EMAR, 'rc', 'liba.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])

# Verify that archive contains basenames with hashes to avoid duplication
text = run_process([PYTHON, EMAR, 't', 'liba.a'], stdout=PIPE).stdout
self.assertEqual(text.count('common.o'), 1)
self.assertContained('common_', text)
for line in text.split('\n'):
# should not have huge hash names
self.assertLess(len(line), 20, line)
try_delete('libdup.a')
run_process([PYTHON, EMAR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
self.assertEqual(text.count('common.o'), 2)

create_test_file('main.c', r'''
void a(void);
Expand All @@ -1469,30 +1463,9 @@ def test_archive_duplicate_basenames(self):
b();
}
''')
err = run_process([PYTHON, EMCC, 'main.c', '-L.', '-la'], stderr=PIPE).stderr
self.assertNotIn('archive file contains duplicate entries', err)
run_process([PYTHON, EMCC, 'main.c', '-L.', '-ldup'])
self.assertContained('a\nb...\n', run_js('a.out.js'))

# Using llvm-ar directly should cause duplicate basenames
try_delete('libdup.a')
run_process([LLVM_AR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
assert text.count('common.o') == 2, text

# With fastcomp we don't support duplicate members so this should generate
# a warning. With the wasm backend (lld) this is fully supported.
cmd = [PYTHON, EMCC, 'main.c', '-L.', '-ldup']
if self.is_wasm_backend():
run_process(cmd)
self.assertContained('a\nb...\n', run_js('a.out.js'))
else:
err = self.expect_fail(cmd)
self.assertIn('libdup.a: archive file contains duplicate entries', err)
self.assertIn('error: undefined symbol: a', err)
# others are not duplicates - the hashing keeps them separate
self.assertEqual(err.count('duplicate: '), 1)
self.assertContained('a\nb...\n', run_js('a.out.js'))

def test_export_from_archive(self):
export_name = 'this_is_an_entry_point'
full_export_name = '_' + export_name
Expand Down
191 changes: 191 additions & 0 deletions tools/arfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#!/usr/bin/env python
# Copyright 2019 The Emscripten Authors. All rights reserved.
# Emscripten is available under two separate licenses, the MIT license and the
# University of Illinois/NCSA Open Source License. Both these licenses can be
# found in the LICENSE file.

"""Utility functions for parsing 'ar' files.
This is needed in emscripten because command line tools such as llvm-ar are not
able to deal with archives containing many files with the same name. Despite
this, linkers are expected to handle this case and emscripten needs to emulate
linker behaviour when using the fastcomp backend.
See https://en.wikipedia.org/wiki/Ar_(Unix)
"""

from __future__ import print_function

import io
import os
import struct
import sys

MAGIC = b'!<arch>\n'
builtin_open = open


class ArError(Exception):
"""Base exception."""
pass


class ArInfo(object):
def __init__(self, name, offset, timestamp, owner, group, mode, size, data):
self.name = name
self.offset = offset
self.timestamp = timestamp
self.owner = owner
self.group = group
self.mode = mode
self.size = size
self.data = data


class ArFile(object):
def __init__(self, filename):
self.filename = filename
self._file = builtin_open(filename, 'rb')
magic = self._file.read(len(MAGIC))
if MAGIC != magic:
raise ArError('not an ar file: ' + filename)
self.members = []
self.members_map = {}
self.offset_to_info = {}

def _read_member(self):
offset = self._file.tell()
name = self._file.read(16)
if len(name) == 0:
return None
name = name.strip()
timestamp = self._file.read(12).strip()
owner = self._file.read(6).strip()
group = self._file.read(6).strip()
mode = self._file.read(8).strip()
size = int(self._file.read(10))
ending = self._file.read(2)
if ending != b'\x60\n':
raise ArError('invalid ar header')
data = self._file.read(size)
if mode.strip():
mode = int(mode)
if owner.strip():
owner = int(owner)
if group.strip():
group = int(group)
if size % 2:
if self._file.read(1) != '\n':
raise ArError('invalid ar header')

return ArInfo(name.decode('utf-8'), offset, timestamp, owner, group, mode, size, data)

def next(self):
while True:
# Keep reading entries until we find a non-special one
info = self._read_member()
if not info:
return None
if info.name == '//':
# Special file containing long filenames
self.name_data = info.data
elif info.name == '/':
# Special file containing symbol table
num_entries = struct.unpack('>I', info.data[:4])[0]
self.sym_offsets = struct.unpack('>%dI' % num_entries, info.data[4:4 + 4 * num_entries])
symbol_data = info.data[4 + 4 * num_entries:-1]
symbol_data = symbol_data.rstrip(b'\0')
if symbol_data:
self.symbols = symbol_data.split(b'\0')
else:
self.symbols = []
if len(self.symbols) != num_entries:
raise ArError('invalid symbol table')
else:
break

# This entry has a name from the "//" name section.
if info.name[0] == '/':
name_offset = int(info.name[1:])
if name_offset < 0 or name_offset >= len(self.name_data):
raise ArError('invalid extended filename section')
name_end = self.name_data.find(b'\n', name_offset)
info.name = self.name_data[name_offset:name_end].decode('utf-8')
info.name = info.name.rstrip('/')
self.members.append(info)
self.members_map[info.name] = info
self.offset_to_info[info.offset] = info
return info

def getsymbols(self):
return zip(self.symbols, self.sym_offsets)

def getmember(self, id):
"""Polymophic member accessor that takes either and index or a name."""
if isinstance(id, int):
return self.getmember_by_index(id)
return self.getmember_by_name(id)

def getmember_by_name(self, name):
self.getmembers()
return self.members_map[name]

def getmember_by_index(self, index):
self.getmembers()
return self.members[index]

def getmembers(self):
while self.next():
pass
return self.members

def list(self):
for m in self.getmembers():
sys.stdout.write(m.name + '\n')

def extractall(self, path="."):
names_written = set()
for m in self.getmembers():
filename = m.name
if filename in names_written:
basename = filename
count = 1
while filename in names_written:
filename = basename + '.' + str(count)
count += 1

names_written.add(filename)
full_name = os.path.join(path, filename)
with builtin_open(full_name, 'wb') as f:
f.write(m.data)

return sorted(list(names_written))

def close(self):
self._file.close()

def __enter__(self):
return self

def __exit__(self, type, value, traceback):
self.close()


def open(filename):
return ArFile(filename)


def is_arfile(filename):
"""Return True if name points to a ar archive that we
are able to handle, else return False.
"""
try:
t = open(filename)
t.close()
return True
except ArError:
return False


if __name__ == '__main__':
open(sys.argv[1]).list()
open(sys.argv[1]).extractall()
Loading

0 comments on commit 1d198e4

Please sign in to comment.