diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml new file mode 100644 index 0000000..9809e2c --- /dev/null +++ b/.github/workflows/unit_tests.yml @@ -0,0 +1,29 @@ +name: File Auto Expiry Unit Tests +on: [pull_request] + +jobs: + tests: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Test with pytest + run: | + pip install pytest pytest-cov + pytest source/tests/test_utils.py --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html + + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9daf04e --- /dev/null +++ b/.gitignore @@ -0,0 +1,177 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python +# Edit at https://www.toptal.com/developers/gitignore?templates=python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +source/__pycache__ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +# End of https://www.toptal.com/developers/gitignore/api/python diff --git a/README.md b/README.md index cb2c25a..94a99ed 100644 --- a/README.md +++ b/README.md @@ -1 +1,14 @@ -# infra_file_auto_expiry \ No newline at end of file +# infra_file_auto_expiry + +Relating to issue: https://github.com/WATonomous/infra-config/issues/1143 + +This project is meant to help automatically expire and delete files. It's currently at the stage of gathering all necessary information about file deletion easier. In the future, it is required to add a notification system for users whose files are to be deleted, and an actual deletion system. + +Currently it moves through every single top level folder in a directory, and checks whether it is expired or not. This means that every single file in that directory tree must be expired. 
As it does this, it gathers all the users who created files in that directory, and the days since the most RECENT atime, ctime, and mtime of ANY file in that directory. It only collects these for folders which have been confirmed to be expired. + +To collect the expiry information of all top level directories in a given path: +sudo $(which python3) /path_to_directory/infra_file_auto_expiry/infra_file_auto_expiry/source/main.py collect-file-info path_to_check_expiry_of + +This will return a jsonl file. You can then use this in the following command to tabulate all expired paths that are associated with a particular user. + +sudo $(which python3) /path_to_directory/infra_file_auto_expiry/infra_file_auto_expiry/source/main.py collect-creator-info path_to_jsonl_file \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ffe491a --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +typer>=0.12.3 diff --git a/source/data/expiry_constants.py b/source/data/expiry_constants.py new file mode 100644 index 0000000..f20f5ea --- /dev/null +++ b/source/data/expiry_constants.py @@ -0,0 +1,18 @@ +# Ignore the following directories because they contain +# large number of generated files. Files in ignored +# directories are assumed to have not been accessed recently. 
+DIRECTORIES_TO_IGNORE = {
+    "ros/humble",
+
+    "lib/python3.10",
+    "lib/python3.9",
+    "lib/python3.8",
+    "lib/python3.7",
+
+    "lib64/python3.10",
+    "lib64/python3.9",
+    "lib64/python3.8",
+    "lib64/python3.7",
+}
+
+SECS_PER_DAY = 86400
\ No newline at end of file
diff --git a/source/data/tuples.py b/source/data/tuples.py
new file mode 100644
index 0000000..f5f260e
--- /dev/null
+++ b/source/data/tuples.py
@@ -0,0 +1,4 @@
+from collections import namedtuple
+
+expiry_tuple = namedtuple("file_tuple", "is_expired, creators, atime, ctime, mtime")
+creator_tuple = namedtuple("creator_tuple", "username, uid, gid")
\ No newline at end of file
diff --git a/source/main.py b/source/main.py
new file mode 100644
index 0000000..87c2a23
--- /dev/null
+++ b/source/main.py
@@ -0,0 +1,34 @@
+from utils.interface import *
+from data.expiry_constants import SECS_PER_DAY
+import time
+import typer
+app = typer.Typer()
+
+@app.command()
+def collect_file_info(path: str, save_file: str = "", days_for_expiry: int = 10):
+    """
+    Collects information about the top level paths within a given folder path
+    And dumps it into a json file, specified by the save_file flag
+    """
+    scrape_time = time.time()
+    # days_for_expiry is already an int (typer coerces the option), no cast needed
+    seconds_for_expiry = days_for_expiry * SECS_PER_DAY
+    expiry_threshold = scrape_time - seconds_for_expiry
+    collect_expired_file_information(folder_path=path,
+                                     save_file=save_file,
+                                     scrape_time=scrape_time,
+                                     expiry_threshold=expiry_threshold)
+
+@app.command()
+def collect_creator_info(file_info: str, save_file: str = ""):
+    """
+    Tabulates the paths that relate to specific users, based on a given jsonl path
+    That jsonl path should be the result of calling the collect_file_info function
+    It then dumps the new information into another json file, specified by the save_file flag
+    """
+    scrape_time = time.time()
+    collect_creator_information(path_info_file=file_info,
+                                save_file=save_file,
+                                scrape_time=scrape_time)
+
+if __name__ == "__main__":
+    app()
\ No newline at end of file
diff
--git a/source/tests/test_utils.py b/source/tests/test_utils.py
new file mode 100644
index 0000000..a7e604b
--- /dev/null
+++ b/source/tests/test_utils.py
@@ -0,0 +1,94 @@
+import unittest
+import os
+import sys
+from unittest.mock import MagicMock, patch
+module_path = os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__))
+)
+sys.path.append(module_path)
+
+from utils.interface import *
+from utils.expiry_checks import *
+
+class TestUtils(unittest.TestCase):
+    @patch("pwd.getpwuid")
+    @patch("os.stat")
+    def test_get_file_creator(self, patch_stat, patch_pwd):
+        """
+        Tests retrieving the user name of a file owner
+        """
+        # Successfully retrieves file owner
+        patch_stat.return_value.st_uid = 5111
+        patch_stat.return_value.st_gid = 1555
+        patch_pwd.return_value.pw_name = "tester_account"
+
+        file_creator = get_file_creator("/home/machung/test.txt")
+        self.assertEqual(file_creator[0], "tester_account")
+        self.assertEqual(file_creator[1], 5111)
+        self.assertEqual(file_creator[2], 1555)
+
+    @patch('os.stat')
+    def test_is_expired_filepath(self, patch_stat):
+        """
+        Tests the is_expired_file function
+        """
+        time_for_expiry = 30  # 30 days
+        patch_stat.st_atime = 5  # 5 days
+        patch_stat.st_ctime = 5  # 5 days
+        patch_stat.st_mtime = 5  # 5 days
+        scrape_time = 50  # 50 days
+        expiry_threshold = scrape_time - time_for_expiry
+
+        # All timestamps (5) are older than the threshold (20),
+        # so the file should be expired
+        self.assertTrue(is_expired_filepath("test_name.txt", patch_stat, expiry_threshold)[0])
+
+        expiry_threshold = -20  # threshold now predates every timestamp
+        # All timestamps (5) are newer than the threshold (-20),
+        # so the file should not be expired
+        expiry_test_result = is_expired_filepath("test_name.txt", patch_stat, expiry_threshold)
+        self.assertFalse(expiry_test_result[0])
+        # BUG FIX: these used assertTrue(5, x), which always passes because
+        # the second argument of assertTrue is the failure message.
+        self.assertEqual(5, expiry_test_result[2])
+        self.assertEqual(5, expiry_test_result[3])
+        self.assertEqual(5, expiry_test_result[4])
+
+    @patch('os.listdir')
+    @patch("os.stat")
+    @patch("utils.expiry_checks.is_expired")
+    def test_is_expired_folder(self, patch_expired, patch_stat, patch_path):
+        """
+        Tests the is_expired_folder function. This should return
+        True (is_expired) if all subdirectories and files are also expired.
+
+        The values of atime, ctime, and mtime should be the largest timestamps
+        seen from the entire folder tree. This indicates the most recent timestamp.
+        In the test we just simulate those timestamps by using smaller integers.
+        """
+        mocked_file_expiry_results_1 = MagicMock()
+        mocked_file_expiry_results_2 = MagicMock()
+
+        # expired member, timestamps 1000/2000/10000
+        mocked_file_expiry_results_1.configure_mock(
+            is_expired = True, creators = ("a", 0, 0), atime = 1000,
+            ctime = 2000, mtime = 10000)
+
+        # unexpired member, timestamps 2000/6000/5000
+        mocked_file_expiry_results_2.configure_mock(
+            is_expired = False, creators = ("b", 1, 1), atime = 2000,
+            ctime = 6000, mtime = 5000)
+
+        patch_expired.side_effect = [mocked_file_expiry_results_1,
+                                     mocked_file_expiry_results_2]
+        patch_path.return_value = ["one.txt", "two.txt"]
+
+        # atime, ctime, mtime for the folder itself
+        patch_stat.st_atime = patch_stat.st_ctime = patch_stat.st_mtime = 3000
+
+        res = is_expired_folder("test_path", patch_stat, 0)
+        # folder must NOT be expired (one member is unexpired), and the
+        # result carries the most recent atime/ctime/mtime seen in the tree
+        self.assertEqual(False, res[0])
+        self.assertEqual(3000, res[2])
+        self.assertEqual(6000, res[3])
+        self.assertEqual(10000, res[4])
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/source/utils/expiry_checks.py b/source/utils/expiry_checks.py
new file mode 100644
index 0000000..3f9d6c3
--- /dev/null
+++ b/source/utils/expiry_checks.py
@@ -0,0 +1,160 @@
+import os
+import stat
+from data.expiry_constants import *
+from data.expiry_constants import DIRECTORIES_TO_IGNORE
+from data.tuples import *
+from utils.file_creator import *
+
+def is_expired(path, expiry_threshold):
+    """ Interface function to return if a file-structure is expired or not.
+    TODO: Provide implementation for character device files, blocks, sockets.
+    """
+
+    path_stat = os.stat(path)
+    if stat.S_ISREG(path_stat.st_mode):  # normal file
+        return is_expired_filepath(path, path_stat, expiry_threshold)
+
+    elif stat.S_ISDIR(path_stat.st_mode):  # folder
+        return is_expired_folder(path, path_stat, expiry_threshold)
+
+    elif stat.S_ISLNK(path_stat.st_mode):  # symlink
+        return is_expired_link(path, path_stat, expiry_threshold)
+
+    elif stat.S_ISCHR(path_stat.st_mode):  # character driver
+        return is_expired_filepath(path, path_stat, expiry_threshold)
+
+    elif stat.S_ISBLK(path_stat.st_mode):  # block
+        return is_expired_filepath(path, path_stat, expiry_threshold)
+
+    elif stat.S_ISFIFO(path_stat.st_mode):  # pipe
+        return is_expired_filepath(path, path_stat, expiry_threshold)
+
+    elif stat.S_ISSOCK(path_stat.st_mode):  # socket
+        return is_expired_filepath(path, path_stat, expiry_threshold)
+
+    # BUG FIX: the original fell through and returned None for any other
+    # file type; fall back to plain-file handling so callers always
+    # receive an expiry_tuple.
+    return is_expired_filepath(path, path_stat, expiry_threshold)
+
+def is_expired_filepath(path, file_stat, expiry_threshold):
+    """
+    Checks the last time a file or folder has been accessed. If it has not
+    been accessed in the days specified, then return True. False if otherwise.
+
+    It will also return a tuple containing the creator name and id, along with the
+    file atime, ctime, and mtime
+    """
+    if os.path.islink(path):
+        # judge a symlink by the link itself, not its target
+        file_stat = os.lstat(path)
+    creator = get_file_creator(path)
+
+    # collect the raw atime, ctime, and mtime timestamps of the file
+    atime = file_stat.st_atime
+    ctime = file_stat.st_ctime
+    mtime = file_stat.st_mtime
+    # If atime, ctime, and mtime are all older than the expiry threshold,
+    # then is_expired is True, along with the other information
+    return expiry_tuple(
+        is_expired=timestamps_are_expired(atime, ctime, mtime,
+                                          expiry_threshold),
+        creators={creator},
+        atime=atime,
+        ctime=ctime,
+        mtime=mtime)
+
+def timestamps_are_expired(atime, ctime, mtime, expiry_threshold):
+    """
+    Checks if all atime, ctime, and mtime are expired.
+    Returns True when all are expired.
+ """ + return ((atime < expiry_threshold) and + (ctime < expiry_threshold) and + (mtime < expiry_threshold)) + +def is_expired_link(path, file_stat, expiry_threshold): + """ + Checks if a symlink is expired. + It will also return a tuple containing the creator name and id, along with the + file atime, ctime, and mtime + """ + if not os.path.islink(path): + raise Exception("Given path is not a valid link.") + + #TODO: implement edge case for when the link points to a recursive directory + # For now, just handle by only considering the link itself + return is_expired_filepath(path=path, file_stat=file_stat, + expiry_threshold=expiry_threshold) + + +def is_expired_folder(folder_path, folder_stat, expiry_threshold): + """ + Goes through all files in a folder. Returns true if ALL files in directory + are expire. + + It will also return a tuple containing the creator name and id, along with the + most recent atime, ctime, and mtime + """ + file_creators = set() + # timestamps for the folder itself + recent_atime = folder_stat.st_atime + recent_ctime = folder_stat.st_ctime + recent_mtime = folder_stat.st_mtime + folder_creator = get_file_creator(folder_path) + file_creators.add(folder_creator) + is_expired_flag = timestamps_are_expired(recent_atime, + recent_ctime, + recent_mtime, + expiry_threshold) + + if check_folder_if_known(path=folder_path): + return expiry_tuple(is_expired_flag, file_creators, recent_atime, + recent_ctime, recent_mtime ) + + # Check expiry status of all files and subdirectories within the folder + for member_file_name in os.listdir(folder_path): + # Tracks the unique names of file creators in the directory + member_file_path = os.path.join(folder_path, member_file_name) + + if not os.path.exists(member_file_path) or os.path.islink(member_file_path): + continue + + file_expiry_information = is_expired(path=str(member_file_path), + expiry_threshold=expiry_threshold) + + if file_expiry_information.is_expired: + # First val in the expiry is always the 
boolean true or false + is_expired_flag = False + + creators = file_expiry_information.creators # collects tuple of (name, uid, gid) + # If file_expiry_information is from a folder, it should already contain a set + # with the information of file creators + if isinstance(creators, set): + for user in creators: + file_creators.add(user) + # if file_expiry_information is from a file, and the creator is not + # already in the set, then they're information is added. + else: + file_creators.add(creators) + + # update atime, ctime, mtime + recent_atime = max(recent_atime, file_expiry_information.atime) + recent_ctime = max(recent_ctime, file_expiry_information.ctime) + recent_mtime = max(recent_mtime, file_expiry_information.mtime) + + return expiry_tuple(is_expired_flag, file_creators, recent_atime, + recent_ctime, recent_mtime) + +def check_folder_if_known(path): + """ + Checks if a folder path is within a known set of directories + that are large and typically non-edited by users. + """ + base_name = os.path.basename(path) + parent_path_name = os.path.basename(os.path.dirname(path)) + if f"{parent_path_name}/{base_name}" in DIRECTORIES_TO_IGNORE: + return True + +def catch_link_issues(path): + """ + Returns True if a link leads to a link or a directory + """ + if os.path.islink(path): + real_path = os.path.realpath(path) + if os.path.islink(real_path) or os.path.isdir(real_path): + return True \ No newline at end of file diff --git a/source/utils/file_creator.py b/source/utils/file_creator.py new file mode 100644 index 0000000..6fe68b2 --- /dev/null +++ b/source/utils/file_creator.py @@ -0,0 +1,19 @@ +import os +import pwd +from data.tuples import * + +def get_file_creator(path): + """ + Returns a tuple including the file creator username, + their UID, and GID in that order respectively. 
+
+    string file_path: The absolute path of the file
+    """
+    # Get the UID of the file or directory owner, then the username
+    # associated with that UID. Stat once and reuse the result.
+    file_stat = os.stat(path)
+    try:
+        username = pwd.getpwuid(file_stat.st_uid).pw_name
+    except KeyError:
+        # UID has no passwd entry; fall back to a synthetic name.
+        # BUG FIX: the original returned a bare string here, which broke
+        # callers that index the result as (username, uid, gid).
+        username = f"user{file_stat.st_uid}"
+    return creator_tuple(username, file_stat.st_uid, file_stat.st_gid)
diff --git a/source/utils/interface.py b/source/utils/interface.py
new file mode 100644
index 0000000..3421394
--- /dev/null
+++ b/source/utils/interface.py
@@ -0,0 +1,142 @@
+import os
+import pwd
+import json
+import datetime
+import time
+from data.expiry_constants import *
+from data.tuples import *
+from utils.expiry_checks import is_expired
+
+def get_file_creator(path):
+    """
+    Returns a tuple including the file creator username,
+    their UID, and GID in that order respectively.
+
+    string file_path: The absolute path of the file
+    """
+    # NOTE(review): duplicated from utils.file_creator — consider importing
+    # it from there instead of keeping two copies.
+    file_stat = os.stat(path)
+    try:
+        username = pwd.getpwuid(file_stat.st_uid).pw_name
+    except KeyError:
+        # BUG FIX: return a creator_tuple (not a bare string) so callers
+        # can always unpack (username, uid, gid).
+        username = f"user{file_stat.st_uid}"
+    return creator_tuple(username, file_stat.st_uid, file_stat.st_gid)
+
+def notify_file_creators():
+    """
+    TODO: implement proper notification system
+    Currently is just the code to print information to a text file
+    """
+
+def scan_folder_for_expired(folder_path, expiry_threshold):
+    """Generator function which iterates the expired top level folders
+    in a given directory.
+
+    Collects expiry information including:
+    - all contributing users in the folder
+    - the days since the most recent atime, ctime, and mtime of the entire folder
+    """
+    if not os.path.isdir(folder_path):
+        # BUG FIX: the original message was garbled ("Given path directory ")
+        raise Exception("Given path is not a directory: " + folder_path)
+
+    for entry in os.scandir(folder_path):
+        if os.path.exists(entry.path):
+            expiry_result = is_expired(entry.path, expiry_threshold)
+            print(entry.path)
+            # path, is_expired, creators (name, uid, gid), atime, ctime, mtime
+            yield entry.path, expiry_result.is_expired, expiry_result.creators, \
+                expiry_result.atime, expiry_result.ctime, expiry_result.mtime
+
+def collect_expired_file_information(folder_path, save_file, scrape_time, expiry_threshold):
+    """
+    Interface function which collects which directories are 'expired'
+
+    String folder_path: The folder to scan for expired files
+    String save_file: The jsonl file path to save the information to,
+        ie "path_name.jsonl"
+    Int scrape_time: the time at the start of the information scrape
+    Int expiry_threshold: timestamps older than this epoch value count as expired
+    """
+    if not os.path.isdir(folder_path):
+        raise Exception("Base folder does not exist")
+
+    if not save_file:
+        # save_file path not given
+        save_file = f"file_information_{str(datetime.datetime.fromtimestamp(scrape_time))}.jsonl"
+
+    path_info = dict()
+    # NOTE: loop variable renamed from "is_expired" so it no longer shadows
+    # the imported utils.expiry_checks.is_expired function.
+    for path, expired, creators, atime, ctime, mtime in scan_folder_for_expired(
+            folder_path, expiry_threshold):
+        # handles generating the dictionary
+        path_info[path] = {
+            "path": path,  # storing pathname so we keep it when we transfer the dictionary to jsonl
+            "creators": [creator for creator in creators],
+            "expired": expired,
+            "time_variables": {
+                "atime_datetime": str(datetime.datetime.fromtimestamp(atime)),
+                "ctime_datetime": str(datetime.datetime.fromtimestamp(ctime)),
+                "mtime_datetime": str(datetime.datetime.fromtimestamp(mtime)),
+            }}
+
+    write_jsonl_information(path_info, save_file, scrape_time)
+
+def write_jsonl_information(dict_info, file_path, scrape_time):
+    """Write two header lines (scrape metadata) then one JSON record per entry."""
+    current_time = time.time()
+
+    with open(file_path, "w") as file:
+        # BUG FIX: the first header key had a stray colon ("scrape_time:")
+        file.write(json.dumps({"scrape_time": scrape_time,
+                               "scrape_time_datetime": str(datetime.datetime.fromtimestamp(scrape_time))}) + "\n")
+        # BUG FIX: the second header line was missing its trailing newline,
+        # which merged it with the first data record; readers that skip two
+        # header lines then silently lost that record.
+        file.write(json.dumps({"time_for_scrape_sec": current_time - scrape_time,
+                               "time_for_scrape_min": (current_time - scrape_time) / 60}) + "\n")
+
+        for key in dict_info:
+            file.write(json.dumps(dict_info[key]) + "\n")
+
+def collect_creator_information(path_info_file, save_file, scrape_time):
+    """
+    Returns a dictionary relating path information to each creator
+    Must be given the return value of form similar to the output of
+    collect_expired_file_information()
+
+    String path_info_file: A jsonl file path containing information about a
+        certain path. This should be the result of calling the collect_file_information
+        function.
+    String save_file: The jsonl file path to save the information to,
+        ie "path_name.jsonl"
+    Int scrape_time: The time at the start of the information scrape.
+    """
+    if not os.path.exists(path_info_file):
+        raise Exception("Given file for path information does not exist")
+
+    if not save_file:
+        # save_file path not given
+        save_file = f"creator_information_{str(datetime.datetime.fromtimestamp(scrape_time))}.jsonl"
+
+    creator_info = dict()
+    # open read-only; the original used "r+" but never writes to this file
+    with open(path_info_file, "r") as file:
+        lines = file.readlines()
+
+    # skip the two scrape-metadata header lines
+    for line in lines[2:]:
+        # One jsonl line of path information
+        path_data = json.loads(line)
+        # check if the path is expired
+        if path_data["expired"]:
+            # take all unique creators and make a new dictionary about them
+            for user in path_data["creators"]:
+                time_vars = path_data["time_variables"]
+                if user[1] in creator_info:
+                    creator_info[user[1]]["paths"][path_data["path"]] = time_vars
+                else:
+                    creator_info[user[1]] = {
+                        "paths": {path_data["path"]: time_vars},
+                        "name": user[0],
+                        "uid": user[1],
+                        "gid": user[2]}
+
+    write_jsonl_information(creator_info, save_file, scrape_time)