Skip to content

Commit

Permalink
Implement ar-file parsing in python (#8681)
Browse files Browse the repository at this point in the history
This takes a different approach to the problem of duplicate files
in ar archives.

This makes the emar wrapper unnecessary, solving the problem at ar-file
extraction time instead.
  • Loading branch information
sbc100 authored Jun 3, 2019
1 parent ee231b0 commit 1d198e4
Show file tree
Hide file tree
Showing 5 changed files with 224 additions and 180 deletions.
89 changes: 5 additions & 84 deletions emar.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,98 +6,19 @@

"""Archive helper script
This script acts as a frontend replacement for `ar`. See emcc.
This is needed because, unlike a traditional linker, emscripten can't handle
archive with duplicate member names. This is because emscripten extracts
archive to a temporary location and duplicate filenames will clobber each
other in this case.
This script is a simple wrapper around llvm-ar. It used to have special
handling for duplicate basenames in order to allow the bitcode linking process
to read such files. This is now handled by using tools/arfile.py to read archives.
"""

# TODO(sbc): Implement `ar x` within emscripten, in python, to avoid this issue
# and delete this file.

from __future__ import print_function
import hashlib
import os
import shutil
import sys

from tools.toolchain_profiler import ToolchainProfiler
from tools import shared
from tools.response_file import substitute_response_files, create_response_file

if __name__ == '__main__':
ToolchainProfiler.record_process_start()


#
# Main run() function
#
def run():
args = substitute_response_files(sys.argv)
newargs = [shared.LLVM_AR] + args[1:]

to_delete = []

# The 3-argument form of ar doesn't involve other files. For example
# 'ar x libfoo.a'.
if len(newargs) > 3:
cmd = newargs[1]
if 'r' in cmd:
# We are adding files to the archive.
# Normally the output file is then arg 2, except in the case were the
# a or b modifiers are used in which case its arg 3.
if 'a' in cmd or 'b' in cmd:
out_arg_index = 3
else:
out_arg_index = 2

contents = set()
if os.path.exists(newargs[out_arg_index]):
cmd = [shared.LLVM_AR, 't', newargs[out_arg_index]]
output = shared.check_call(cmd, stdout=shared.PIPE).stdout
contents.update(output.split('\n'))

# Add a hash to colliding basename, to make them unique.
for j in range(out_arg_index + 1, len(newargs)):
orig_name = newargs[j]
full_name = os.path.abspath(orig_name)
dirname = os.path.dirname(full_name)
basename = os.path.basename(full_name)
if basename not in contents:
contents.add(basename)
continue
h = hashlib.md5(full_name.encode('utf-8')).hexdigest()[:8]
parts = basename.split('.')
parts[0] += '_' + h
newname = '.'.join(parts)
full_newname = os.path.join(dirname, newname)
assert not os.path.exists(full_newname)
try:
shutil.copyfile(orig_name, full_newname)
newargs[j] = full_newname
to_delete.append(full_newname)
contents.add(newname)
except:
# it is ok to fail here, we just don't get hashing
contents.add(basename)
pass

if shared.DEBUG:
print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)

response_filename = create_response_file(newargs[3:], shared.get_emscripten_temp_dir())
to_delete += [response_filename]
newargs = newargs[:3] + ['@' + response_filename]

if shared.DEBUG:
print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)

try:
return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
finally:
for d in to_delete:
shared.try_delete(d)
newargs = [shared.LLVM_AR] + sys.argv[1:]
return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode


if __name__ == '__main__':
Expand Down
14 changes: 6 additions & 8 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import hashlib
import json
import os
import random
import re
import shutil
import sys
Expand Down Expand Up @@ -5164,26 +5163,25 @@ def test_iostream_and_determinism(self):
return 0;
}
'''
num = 5
num = 3

def test():
print('(iteration)')
time.sleep(random.random() / (10 * num)) # add some timing nondeterminism here, not that we need it, but whatever
time.sleep(1.0)
self.do_run(src, 'hello world\n77.\n')
ret = open('src.cpp.o.js', 'rb').read()
if self.get_setting('WASM') and not self.get_setting('WASM2JS'):
ret += open('src.cpp.o.wasm', 'rb').read()
return ret

builds = [test() for i in range(num)]
print(list(map(len, builds)))
print([len(b) for b in builds])
uniques = set(builds)
if len(uniques) != 1:
i = 0
for unique in uniques:
for i, unique in enumerate(uniques):
open('unique_' + str(i) + '.js', 'wb').write(unique)
i += 1
assert 0, 'builds must be deterministic, see unique_X.js'
# builds must be deterministic, see unique_N.js
self.assertEqual(len(uniques), 1)

def test_stdvec(self):
self.do_run_in_out_file_test('tests', 'core', 'test_stdvec')
Expand Down
37 changes: 5 additions & 32 deletions tests/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -1450,16 +1450,10 @@ def test_archive_duplicate_basenames(self):
''')
run_process([PYTHON, EMCC, os.path.join('b', 'common.c'), '-c', '-o', os.path.join('b', 'common.o')])

try_delete('liba.a')
run_process([PYTHON, EMAR, 'rc', 'liba.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])

# Verify that archive contains basenames with hashes to avoid duplication
text = run_process([PYTHON, EMAR, 't', 'liba.a'], stdout=PIPE).stdout
self.assertEqual(text.count('common.o'), 1)
self.assertContained('common_', text)
for line in text.split('\n'):
# should not have huge hash names
self.assertLess(len(line), 20, line)
try_delete('libdup.a')
run_process([PYTHON, EMAR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
self.assertEqual(text.count('common.o'), 2)

create_test_file('main.c', r'''
void a(void);
Expand All @@ -1469,30 +1463,9 @@ def test_archive_duplicate_basenames(self):
b();
}
''')
err = run_process([PYTHON, EMCC, 'main.c', '-L.', '-la'], stderr=PIPE).stderr
self.assertNotIn('archive file contains duplicate entries', err)
run_process([PYTHON, EMCC, 'main.c', '-L.', '-ldup'])
self.assertContained('a\nb...\n', run_js('a.out.js'))

# Using llvm-ar directly should cause duplicate basenames
try_delete('libdup.a')
run_process([LLVM_AR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
assert text.count('common.o') == 2, text

# With fastcomp we don't support duplicate members so this should generate
# a warning. With the wasm backend (lld) this is fully supported.
cmd = [PYTHON, EMCC, 'main.c', '-L.', '-ldup']
if self.is_wasm_backend():
run_process(cmd)
self.assertContained('a\nb...\n', run_js('a.out.js'))
else:
err = self.expect_fail(cmd)
self.assertIn('libdup.a: archive file contains duplicate entries', err)
self.assertIn('error: undefined symbol: a', err)
# others are not duplicates - the hashing keeps them separate
self.assertEqual(err.count('duplicate: '), 1)
self.assertContained('a\nb...\n', run_js('a.out.js'))

def test_export_from_archive(self):
export_name = 'this_is_an_entry_point'
full_export_name = '_' + export_name
Expand Down
191 changes: 191 additions & 0 deletions tools/arfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#!/usr/bin/env python
# Copyright 2019 The Emscripten Authors. All rights reserved.
# Emscripten is available under two separate licenses, the MIT license and the
# University of Illinois/NCSA Open Source License. Both these licenses can be
# found in the LICENSE file.

"""Utility functions for parsing 'ar' files.
This is needed in emscripten because command line tools such as llvm-ar are not
able to deal with archives containing many files with the same name. Despite
this, linkers are expected to handle this case and emscripten needs to emulate
linker behaviour when using the fastcomp backend.
See https://en.wikipedia.org/wiki/Ar_(Unix)
"""

from __future__ import print_function

import io
import os
import struct
import sys

MAGIC = b'!<arch>\n'
builtin_open = open


class ArError(Exception):
"""Base exception."""
pass


class ArInfo(object):
def __init__(self, name, offset, timestamp, owner, group, mode, size, data):
self.name = name
self.offset = offset
self.timestamp = timestamp
self.owner = owner
self.group = group
self.mode = mode
self.size = size
self.data = data


class ArFile(object):
def __init__(self, filename):
self.filename = filename
self._file = builtin_open(filename, 'rb')
magic = self._file.read(len(MAGIC))
if MAGIC != magic:
raise ArError('not an ar file: ' + filename)
self.members = []
self.members_map = {}
self.offset_to_info = {}

def _read_member(self):
offset = self._file.tell()
name = self._file.read(16)
if len(name) == 0:
return None
name = name.strip()
timestamp = self._file.read(12).strip()
owner = self._file.read(6).strip()
group = self._file.read(6).strip()
mode = self._file.read(8).strip()
size = int(self._file.read(10))
ending = self._file.read(2)
if ending != b'\x60\n':
raise ArError('invalid ar header')
data = self._file.read(size)
if mode.strip():
mode = int(mode)
if owner.strip():
owner = int(owner)
if group.strip():
group = int(group)
if size % 2:
if self._file.read(1) != '\n':
raise ArError('invalid ar header')

return ArInfo(name.decode('utf-8'), offset, timestamp, owner, group, mode, size, data)

def next(self):
while True:
# Keep reading entries until we find a non-special one
info = self._read_member()
if not info:
return None
if info.name == '//':
# Special file containing long filenames
self.name_data = info.data
elif info.name == '/':
# Special file containing symbol table
num_entries = struct.unpack('>I', info.data[:4])[0]
self.sym_offsets = struct.unpack('>%dI' % num_entries, info.data[4:4 + 4 * num_entries])
symbol_data = info.data[4 + 4 * num_entries:-1]
symbol_data = symbol_data.rstrip(b'\0')
if symbol_data:
self.symbols = symbol_data.split(b'\0')
else:
self.symbols = []
if len(self.symbols) != num_entries:
raise ArError('invalid symbol table')
else:
break

# This entry has a name from the "//" name section.
if info.name[0] == '/':
name_offset = int(info.name[1:])
if name_offset < 0 or name_offset >= len(self.name_data):
raise ArError('invalid extended filename section')
name_end = self.name_data.find(b'\n', name_offset)
info.name = self.name_data[name_offset:name_end].decode('utf-8')
info.name = info.name.rstrip('/')
self.members.append(info)
self.members_map[info.name] = info
self.offset_to_info[info.offset] = info
return info

def getsymbols(self):
return zip(self.symbols, self.sym_offsets)

def getmember(self, id):
"""Polymophic member accessor that takes either and index or a name."""
if isinstance(id, int):
return self.getmember_by_index(id)
return self.getmember_by_name(id)

def getmember_by_name(self, name):
self.getmembers()
return self.members_map[name]

def getmember_by_index(self, index):
self.getmembers()
return self.members[index]

def getmembers(self):
while self.next():
pass
return self.members

def list(self):
for m in self.getmembers():
sys.stdout.write(m.name + '\n')

def extractall(self, path="."):
names_written = set()
for m in self.getmembers():
filename = m.name
if filename in names_written:
basename = filename
count = 1
while filename in names_written:
filename = basename + '.' + str(count)
count += 1

names_written.add(filename)
full_name = os.path.join(path, filename)
with builtin_open(full_name, 'wb') as f:
f.write(m.data)

return sorted(list(names_written))

def close(self):
self._file.close()

def __enter__(self):
return self

def __exit__(self, type, value, traceback):
self.close()


def open(filename):
return ArFile(filename)


def is_arfile(filename):
"""Return True if name points to a ar archive that we
are able to handle, else return False.
"""
try:
t = open(filename)
t.close()
return True
except ArError:
return False


if __name__ == '__main__':
open(sys.argv[1]).list()
open(sys.argv[1]).extractall()
Loading

0 comments on commit 1d198e4

Please sign in to comment.