Rewrite sync script logic

Since the verify-and-add of the filestore is currently slow, and there is no way to get hashes from just filenames, something else is needed.

IPFS already has these features in the MFS (ipfs files) functions. By adding the files with --nocopy and then copying them into the ipfs files structure, all of this can be tracked.
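A minimal sketch of that idea, using an illustrative path (the script below derives the real paths from ${DSTBASE} and ${REPONAME}):

    # add by reference (needs the filestore experiment enabled), then track the
    # resulting object under an MFS path so it can later be found by filename
    HASH=$(ipfs add -Q --nocopy --raw-leaves /path/to/gentoo-distfiles/distfiles/example.tar.xz)
    ipfs files mkdir -p /gentoo-distfiles/distfiles
    ipfs files cp /ipfs/${HASH} /gentoo-distfiles/distfiles/example.tar.xz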

Recently modified files (as synced by rsync) are now added, and then MFS is updated. The old hash is fetched before being replaced, so that it can be cleaned up from MFS and the pins.
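Extending the sketch above with that old-hash bookkeeping, per modified file (again with illustrative paths; the script wraps the cleanup in a helper):

    lifp=/gentoo-distfiles/distfiles/example.tar.xz         # illustrative MFS path
    OLDHASH=$(ipfs files stat --hash "$lifp" 2>/dev/null)   # remember the hash before replacing
    NEWHASH=$(ipfs add -Q --nocopy --raw-leaves /path/to/gentoo-distfiles/distfiles/example.tar.xz)
    if [[ "$NEWHASH" != "$OLDHASH" ]]; then
        [[ -n "$OLDHASH" ]] && ipfs files rm -r "$lifp" && ipfs pin rm -r "$OLDHASH"
        ipfs files cp /ipfs/${NEWHASH} "$lifp"
    fi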

Similarly, for deleted files an rsync dry run is used to get the list of files that should be deleted.
The hash is fetched from MFS, and the MFS entry, pin and actual file are removed by a bash loop instead of by rsync.
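Per deleted path that loop amounts to roughly the following (sketch, illustrative paths):

    # rsync --dry-run --delete prints lines like "deleting distfiles/old.tar.xz"
    p=distfiles/old.tar.xz                                  # illustrative relative path
    lifp=/gentoo-distfiles/${p}
    OLDHASH=$(ipfs files stat --hash "$lifp" 2>/dev/null)
    [[ -n "$OLDHASH" ]] && ipfs files rm -r "$lifp" && ipfs pin rm -r "$OLDHASH"
    rm -f "/path/to/gentoo-distfiles/${p}"                  # finally remove the file on disk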

This also keeps the hash tree in MFS up to date, so the new root hash can be grabbed from there, which seems to be much faster.

As a fallback, a full add is still used (when no hash exists in MFS).
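Roughly, the root hash is read straight from MFS, and only the first run (nothing in MFS yet) falls back to re-adding the whole tree:

    NEWREPOHASH=$(ipfs files stat --hash /gentoo-distfiles 2>/dev/null)
    if [[ -z "$NEWREPOHASH" ]]; then
        # full fallback add: -w wraps the children since the base dir may be a symlink
        NEWREPOHASH=$(ipfs add -Q -w -r --nocopy /path/to/gentoo-distfiles/*)
    fi
    ipfs name publish /ipfs/${NEWREPOHASH}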
NiKiZe committed Jul 29, 2018
1 parent 0b25521 commit bc71758
Showing 1 changed file: sync-gentoo-distfiles.sh (102 additions, 37 deletions)
@@ -17,68 +17,133 @@ SRC="trumpetti.atm.tut.fi::gentoo$1"
#SRC="rsync://rsync.us.gentoo.org/gentoo-portage" # for the rest of the world
#Uncomment the following line only if you have been granted access to masterportage.gentoo.org
#SRC="rsync://masterportage.gentoo.org/gentoo-portage"
DSTBASE="${HOME}/gentoo-distfiles"
REPONAME="gentoo-distfiles"
DSTBASE="${HOME}/${REPONAME}"
DST="${DSTBASE}$1"

echo "Started update at" `date` >> $0.log 2>&1
# some optimizations for large datasets; https://github.com/ipfs/notes/issues/212
# Sharding is needed to handle directories that would otherwise generate too large objects - here we force it
ipfs config --json Experimental.ShardingEnabled true
ipfs config --json Datastore.NoSync true
# allow --nocopy
ipfs config --json Experimental.FilestoreEnabled true

getmfsrepohash() {
ipfs files stat --hash /${REPONAME} 2> /dev/null
}

OLDREPOHASH=$(getmfsrepohash)

echo "Started rsync at " `date` >> $0.log 2>&1
logger -t rsync "re-rsyncing the gentoo-portage tree"
${RSYNC} ${OPTS} ${SRC} ${DST} >> $0.log 2>&1
echo "End: "`date` >> $0.log 2>&1
echo "Done rsync at "`date` >> $0.log 2>&1

# TODO collect any difference in mtime from the above
# BUG during rsync some files are updated/replaced, so this delete mangling might not help much
# TODO handle if delete.log is missing - which timestamp should be used?
LASTSYNCDONE=$(stat -c %Z $0.delete.log)
# grab the last delete date and subtract 24 hours
# this should give us all modified files
OLDFILEDATE=$(date -u --date=@$((${LASTSYNCDONE} - 24*60*60)))

# Using tempfile to get last line https://github.com/VictorBjelkholm/arch-mirror/blob/master/ipfsify.sh
HASHFILE=$0.ipfsadd.log
mv ${HASHFILE} ${HASHFILE}.old

removeold_ipfs() {
if ipfs files ls $1 2>&1 > /dev/null; then
echo removing $1 $2 >> $0.delete.log 2>&1
ipfs files rm -r --local $1 >> $0.delete.log 2>&1
[[ "$2" != "" ]] && (ipfs pin rm -r --local $2 >> $0.delete.log 2>&1) &
fi
}

# TODO We can only do this add if we actually have data for it
if [[ "$OLDREPOHASH" != "" ]]; then
echo "Old repo hash $OLDREPOHASH "`date` >> $0.log 2>&1
find ${DSTBASE}/ -newerct "${OLDFILEDATE}" \( -type f -o -type l \) | while read l; do
lifp=/${REPONAME}${l#${DSTBASE}}
if [[ -L "$l" ]]; then
# there are some issues with symlinks - it is mostly OK to ignore them
OLDHASH=""
else
OLDHASH=$(ipfs files stat --hash $lifp 2> /dev/null)
fi
echo "doing add for $lifp old file hash: $OLDHASH" >> $0.log 2>&1
# TODO use this as a pipe instead - should avoid opening and closing
(ipfs add --nocopy --raw-leaves --local $l > ${HASHFILE}) >> $0.log 2>&1
HASH="$(tail -n1 ${HASHFILE} | cut -d ' ' -f2)"
[[ "$HASH" == "$OLDHASH" ]] && continue
removeold_ipfs $lifp $OLDHASH >> $0.log 2>&1
echo "got $lifp with hash $HASH" >> $0.log 2>&1
ipfs files cp /ipfs/${HASH} $lifp >> $0.log 2>&1
done
NEWREPOHASH=$(getmfsrepohash)
[[ "$NEWREPOHASH" != "$OLDREPOHASH" ]] && echo "Root hash changed from $OLDREPOHASH to $NEWREPOHASH" >> $0.log 2>&1
echo "Add/update recently changed files new hash $NEWREPOHASH "`date` >> $0.log 2>&1

# do a dryrun of sync and grab the delete lines
mv $0.delete.log $0.delete.log.old
${RSYNC} ${OPTS} --dry-run --delete ${SRC} ${DST} 2>&1 | tee $0.delete.log >> $0.log
cat $0.delete.log >> $0.log
grep ^deleting "$0.delete.log" | cut -d ' ' -f 2- | while read l; do
# hopefully this will be easier in the future
lifp=/${REPONAME}/${l#${DSTBASE}}
if [[ -L "$l" ]]; then
# there are some issues with symlinks - it is mostly OK to ignore them
OLDHASH=""
else
OLDHASH=$(ipfs files stat --hash $lifp 2> /dev/null)
fi
echo "removing $lifp $OLDHASH" >> $0.log 2>&1
removeold_ipfs $lifp $OLDHASH >> $0.log 2>&1
echo "removing actuall file ${DSTBASE}/${l} $OLDHASH" >> $0.log 2>&1
rm -rf ${DSTBASE}/${l} >> $0.log 2>&1
done
NEWREPOHASH=$(getmfsrepohash)
[[ "$NEWREPOHASH" != "$OLDREPOHASH" ]] && echo "Root hash changed from $OLDREPOHASH to $NEWREPOHASH" >> $0.log 2>&1
echo "Remove old files new hash $NEWREPOHASH "`date` >> $0.log 2>&1
fi # the ipfs MFS entry does not yet exist - first run?

# do a dryrun of sync and grab the delete lines
mv $0.delete.log $0.delete.log.old
${RSYNC} ${OPTS} --dry-run --delete ${SRC} ${DST} 2>&1 | tee $0.delete.log >> $0.log
grep ^deleting "$0.delete.log" | cut -d ' ' -f 2- | while read l; do
# TODO grep the delete file for files to do ipfs pin rm, but that requires the hash for it, so needs a lookup in $0.ipfsadd.log
# hopefully this will be easier in the future
break
done

cat $0.delete.log >> $0.log
echo "Delete dl done: "`date` >> $0.log 2>&1
# make sure we don't refer to anything that might have been removed,
# see https://github.com/ipfs/go-ipfs/issues/4260#issuecomment-406827554
# Update: we need the verify stuff, but with --file-order it takes on the order of an hour
mv verify.log verify.log.old
(time (ipfs filestore verify --local --file-order | grep -v ^ok)) 2>&1 | tee verify.log >> $0.log
echo "verify done: "`date` >> $0.log 2>&1
#mv verify.log verify.log.old
#(time (ipfs filestore verify --local --file-order | grep -v ^ok)) 2>&1 | tee verify.log >> $0.log
#echo "verify done: "`date` >> $0.log 2>&1
# verify on its own doesn't seem to actually remove anything
grep -q -v ^ok verify.log && (time ipfs repo gc) >> $0.log 2>&1
echo "gc done: "`date` >> $0.log 2>&1
#grep -q -v ^ok verify.log && (time ipfs repo gc) >> $0.log 2>&1
#echo "gc done: "`date` >> $0.log 2>&1

# some optimizations for large datasets; https://github.com/ipfs/notes/issues/212
# Sharding is needed to handle directories that would otherwise generate too large objects - here we force it
ipfs config --json Experimental.ShardingEnabled true
ipfs config --json Datastore.NoSync true
# allow --nocopy
ipfs config --json Experimental.FilestoreEnabled true

# Using tempfile to get last line https://github.com/VictorBjelkholm/arch-mirror/blob/master/ipfsify.sh
HASHFILE=$0.ipfsadd.log
mv ${HASHFILE} ${HASHFILE}.old
# re-adding the tree takes over an hour
# gentoo-distfiles might be a symlink so take its children with /* and use -w to wrap them
# symlinks in the tree might not yet be working; https://github.com/VictorBjelkholm/arch-mirror/issues/1
(time (ipfs add -w -r --nocopy --local ${DSTBASE}/* > ${HASHFILE})) >> $0.log 2>&1
HASH="$(tail -n1 ${HASHFILE} | cut -d ' ' -f2)"
HASH=$NEWREPOHASH
if [[ "$OLDREPOHASH" == "" ]] || [[ "$FULLADD" == "fulladd" ]]; then
(time (ipfs add -w -r --nocopy --local ${DSTBASE}/* > ${HASHFILE})) >> $0.log 2>&1
HASH="$(tail -n1 ${HASHFILE} | cut -d ' ' -f2)"
fi

#TODO check for existing files item, and if it exists check hash, only update if changed
ipfs files rm /gentoo-distfiles.old
ipfs files mv /gentoo-distfiles /gentoo-distfiles.old
ipfs files cp /ipfs/${HASH} /gentoo-distfiles
if [[ "$OLDREPOHASH" != "" ]]; then # TODO check existing old node, only update if changed
ipfs files ls /${REPONAME}.old 2>&1 > /dev/null && ipfs files rm -r /${REPONAME}.old >> $0.log 2>&1
ipfs files cp /ipfs/${OLDREPOHASH} /${REPONAME}.old >> $0.log 2>&1
fi
if [[ "$HASH" != "" ]] && [[ "$(getmfsrepohash)" != "$HASH" ]]; then
ipfs files cp /ipfs/${HASH} /${REPONAME}.new >> $0.log 2>&1
ipfs files rm -r /${REPONAME} >> $0.log 2>&1
ipfs files mv /${REPONAME}.new /${REPONAME} >> $0.log 2>&1
fi

echo "ipfs add ${HASH} done: "`date` >> $0.log 2>&1
logger -t rsync "sync gentoo-portage tree done IPFS ${HASH}"

# run ipfs name commands in background since they are slow
ipfs name publish /ipfs/${HASH} &
(ipfs name publish /ipfs/${HASH} >> $0.log 2>&1) &
# if IPNS is mounted we get "Error: cannot manually publish while IPNS is mounted" - that needs a workaround

# Add DNS; _dnslink.distfiles.gentoo.org TXT "dnslink=/ipfs/${HASH}"
# it speeds up name resolution since IPNS is for the moment too slow
[[ -x dnsupdate.sh ]] && [[ "${HASH}" != "" ]]&& sh dnsupdate.sh "dnslink=/ipfs/${HASH}"
[[ -x dnsupdate.sh ]] && [[ "${HASH}" != "" ]]&& sh dnsupdate.sh "dnslink=/ipfs/${HASH}" >> $0.log 2>&1
# example; dig txt _dnslink.arch.victor.earth
# symlinks might not yet be working; https://github.com/VictorBjelkholm/arch-mirror/issues/1
