Skip to content
Snippets Groups Projects
Commit b0b663f2 authored by Christof Kaufmann's avatar Christof Kaufmann
Browse files

Initial commit

parents
Branches
No related tags found
No related merge requests found
Pipeline #1840 failed
# Owncloud Download Synchronizer
This is a little tool to synchronize files from a public link share to a local directory. Features:
- only download, not upload
- update files based on timestamp of local and upstream files
- include / exclude glob patterns to select files to download
- ignore glob patterns to select local files to keep
- logging to screen and file
- dry-run mode
Basic usage:
``` bash
python oc-downsync.py https://owncloud.example.com/s/aSdFgHjKl local-dir/
```
With password:
``` bash
python oc-downsync.py https://owncloud.example.com/s/aSdFgHjKl --password=123456 local-dir/
```
Only download `*.md` files (and delete other files, if synchronized to local-dir before):
``` bash
python oc-downsync.py https://owncloud.example.com/s/aSdFgHjKl local-dir/ --include='*.md'
```
Download all files except `*.md` files, but do download `/README.md` (and delete other `*.md` files, if synchronized to local-dir before):
``` bash
python oc-downsync.py https://owncloud.example.com/s/aSdFgHjKl local-dir/ --exclude='*.md' --include='/README.md'
```
Keep all files in local directory `/bin` and subdirectories although it is not in the public share:
``` bash
python oc-downsync.py https://owncloud.example.com/s/aSdFgHjKl local-dir/ --ignore='/bin/*'
```
See actions before applying them:
``` bash
python oc-downsync.py https://owncloud.example.com/s/aSdFgHjKl local-dir/ --log-to-screen --dry-run
```
#!/usr/bin/env python3
# %% stuff without main
import argparse
import copy
from datetime import datetime, timezone
import fnmatch
from glob import glob
import logging
import owncloud # type: ignore[import-untyped]
import os
import requests
import shutil
import sys
import textwrap
from typing import Literal, Optional, Sequence
## argparse utilities
# custom argparse action to keep the order and type of argument groups
class OrderedArgs(argparse.Action):
    '''argparse action that appends (option-name, value) tuples to one shared
    destination, so the relative order of several different options (e.g.
    --include / --exclude) is preserved exactly as given on the command line.'''

    def __init__(self, option_strings, dest, nargs=None, const=None, **kwargs):
        # same sanity checks as argparse's own append action
        if nargs == 0:
            raise ValueError('nargs for ordered append actions must be != 0; if arg '
                             'strings are not supplying the value to append, '
                             'the append const action may be more appropriate')
        if const is not None and nargs != argparse.OPTIONAL:
            raise ValueError(f'nargs must be {repr(argparse.OPTIONAL)} to supply const')
        super().__init__(option_strings, dest, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        current = getattr(namespace, self.dest, None)
        if current is None:
            current = []
        elif not isinstance(current, list):
            raise ValueError(f'Argument {self.dest} must be a list, not {type(current)}')
        # strip up to two leading dashes: '--include' -> 'include', '-i' -> 'i'
        option_name = option_string.removeprefix('-').removeprefix('-')
        # deep-copy so different namespaces never share the same list object
        updated = copy.deepcopy(current)
        entries = [values] if isinstance(values, str) else values
        updated.extend((option_name, value) for value in entries)
        setattr(namespace, self.dest, updated)
# help text formatter that prints newlines and default values
class RawTextArgumentDefaultsHelpFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    '''Help formatter combining two argparse mixins: keep newlines in help
    texts as written (RawTextHelpFormatter) and append the default value of
    each argument to its help line (ArgumentDefaultsHelpFormatter).'''
    pass
## logging utilities
# add trace level, see https://gist.github.com/numberoverzero/f803ebf29a0677b6980a5a733a10ca71
def _install_trace_logger():
    '''Register a TRACE log level (below DEBUG) plus ``Logger.trace`` and
    ``logging.trace`` convenience methods, mirroring ``debug``/``logging.debug``.
    Idempotent: does nothing if TRACE is already installed.'''
    if hasattr(logging, 'TRACE'):
        return
    # TRACE = 5, i.e. one step more verbose than DEBUG (10)
    level = logging.TRACE = logging.DEBUG - 5
    def log_logger(self, message, *args, **kwargs):
        # instance method added to the Logger class, analogous to Logger.debug
        if self.isEnabledFor(level):
            self._log(level, message, args, **kwargs) # pylint: disable=protected-access
    logging.getLoggerClass().trace = log_logger
    def log_root(msg, *args, **kwargs):
        # module-level helper, analogous to logging.debug; stacklevel=3 makes
        # %(funcName)s / %(lineno)d point at the caller, not this wrapper
        kwargs.setdefault('stacklevel', 3)
        logging.log(level, msg, *args, **kwargs)
    logging.addLevelName(level, "TRACE")
    logging.trace = log_root
# install at import time so all loggers created later support .trace()
_install_trace_logger()
# logging setup, also for uncaught exceptions
def setup_logger(name, filename, loglevel='WARNING', screenlogging=False, log_exceptions=False):
    '''Create or reconfigure the named logger.

    Parameters:
        name: logger name passed to logging.getLogger.
        filename: log file path ('~' is expanded); falsy disables file logging.
        loglevel: level name or number for the logger.
        screenlogging: if True, additionally log to stdout.
        log_exceptions: if True, install an excepthook that logs uncaught
            exceptions as CRITICAL (KeyboardInterrupt is passed through).

    Returns:
        The configured logging.Logger instance.

    The function is idempotent: calling it again with the same file / screen
    settings does not add duplicate handlers.
    '''
    formatter = logging.Formatter(fmt='[%(levelname).1s] %(asctime)-15s: %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    logg = logging.getLogger(name)
    if filename:
        # normalize to an absolute path: FileHandler stores baseFilename as
        # abspath, so comparing against a relative path would never match and
        # every call would add another duplicate file handler
        filename = os.path.abspath(os.path.expanduser(filename))
        if not any(h.baseFilename == filename for h in logg.handlers if isinstance(h, logging.FileHandler)):
            # make parent directory if it does not exist
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            file_handler = logging.FileHandler(filename, mode='a')
            file_handler.setFormatter(formatter)
            logg.addHandler(file_handler)
    # NOTE: FileHandler is a StreamHandler subclass, so an isinstance() check
    # would wrongly treat the file handler above as a screen handler and skip
    # screen logging; compare the exact type instead
    if screenlogging and not any(type(h) is logging.StreamHandler for h in logg.handlers):
        screen_handler = logging.StreamHandler(stream=sys.stdout)
        screen_handler.setFormatter(formatter)
        logg.addHandler(screen_handler)
    def handle_exception(exc_type, exc_value, exc_traceback):
        # let Ctrl-C terminate silently with the default behavior
        if issubclass(exc_type, KeyboardInterrupt):
            sys.__excepthook__(exc_type, exc_value, exc_traceback)
            return
        logg.critical("Uncaught exception. Please send a bug report to Christof Kaufmann!\n", exc_info=(exc_type, exc_value, exc_traceback))
    if log_exceptions:
        sys.excepthook = handle_exception
    logg.setLevel(level=loglevel)
    return logg
## functions
def _get_remote_fileinfos(oc: owncloud.Client, includes_excludes: Sequence[tuple[str, str]], logger: logging.Logger) -> list[owncloud.FileInfo]:
# get remote files
remote_paths = oc.list('.', depth=200)
remote_files = [f for f in remote_paths if not f.is_dir()]
if not includes_excludes:
return remote_files
# print([f.path for f in remote_files])
# init remote files depending on whether the first filter is include or exclude
first_in_or_ex = includes_excludes[0][0]
if first_in_or_ex == 'include':
remote_files_filtered = []
else:
remote_files_filtered = remote_files.copy()
# filter with remaining include / exclude rules
for in_or_ex, pat in includes_excludes:
if in_or_ex == 'include':
remote_files_filtered += [f for f in remote_files if fnmatch.fnmatch(f.path, pat)]
else:
remote_files_filtered = [f for f in remote_files_filtered if not fnmatch.fnmatch(f.path, pat)]
# print([f.path for f in remote_files_filtered])
if logger.getEffectiveLevel() >= logging.DEBUG:
filtered_files = list({f.path for f in remote_files} - {f.path for f in remote_files_filtered})
logger.trace('The following files have been filtered out from download candidates due to include / exclude rules:\n' + str(filtered_files)) # type: ignore[attr-defined]
logger.trace('The following files are left as download candidates after filtering with include / exclude rules:\n' + str([f.path for f in remote_files_filtered])) # type: ignore[attr-defined]
return remote_files_filtered
def _get_local_files(target_dir: str, logger: logging.Logger) -> list[str]:
local_paths = glob(os.path.join(target_dir, '**'), recursive=True)
local_files = [f for f in local_paths if os.path.exists(f) and not os.path.isdir(f)]
logger.trace('Found the following local files:\n' + str(local_files)) # type: ignore[attr-defined]
return local_files
def _find_file_changes(target_dir: str, local_files: list[str], remote_fileinfos: list[owncloud.FileInfo], ignores: Sequence[str], logger: logging.Logger) -> tuple[list[str], list[str]]:
    '''Compare local files against the remote file infos and plan the sync.

    Parameters:
        target_dir: local sync root; rel_paths are computed relative to it.
        local_files: absolute/joined paths of existing local files.
        remote_fileinfos: remote files (paths start with '/').
        ignores: glob patterns of local files to leave untouched.
        logger: logger for per-file decisions (DEBUG) and no-ops (TRACE).

    Returns:
        (to_remove, to_download): two lists of paths relative to target_dir.
        A path can appear in both lists (delete then re-download).
    '''
    def is_ignored(path):
        # True if the '/'-prefixed path matches any --ignore glob pattern
        for pat in ignores:
            if fnmatch.fnmatch(path, pat):
                return True
        return False
    to_remove = []
    # pass 1: local files that no longer exist remotely -> delete (unless ignored)
    for local_path in local_files:
        rel_path = os.path.relpath(local_path, target_dir)
        remote_path = os.path.join('/', rel_path)
        # remote paths carry a leading '/', rel_path does not -> compare path[1:]
        only_local = all((rf.path[1:] != rel_path for rf in remote_fileinfos))
        if only_local:
            if is_ignored(remote_path):
                logger.trace(f"[ignore] Local file {local_path} is left as it is since it is ignored. Would be deleted otherwise.") # type: ignore[attr-defined]
                continue
            logger.debug(f"[to delete] Local file {local_path} is NOT in remote (anymore). Will be deleted.") # pylint: disable=logging-fstring-interpolation
            # delete
            to_remove.append(rel_path)
            continue
        else:
            logger.trace(f"[no action] Local file {local_path} is in remote. Will not be deleted (but maybe updated).") # type: ignore[attr-defined]
    to_download = []
    # pass 2: remote files that are missing, shadowed or outdated locally -> download
    for f in remote_fileinfos:
        # remote timestamps are treated as UTC (naive datetime gets tzinfo attached)
        # NOTE(review): assumes owncloud reports last-modified in UTC — confirm
        remote_mtime = f.get_last_modified().replace(tzinfo=timezone.utc)
        remote_path = f.path
        rel_path = f.path[1:] # remove leading /
        local_path = os.path.join(target_dir, rel_path)
        if not os.path.exists(local_path):
            logger.debug(f"[to download] Remote file {f.path} does not exist locally as {local_path}. Will be downloaded.") # pylint: disable=logging-fstring-interpolation
            # copy
            to_download.append(rel_path)
            continue
        if not os.path.isfile(local_path):
            # local path exists but is a directory / special file
            if is_ignored(remote_path):
                logger.trace(f"[ignore] Local non-regular file {local_path} is left as it is since it is ignored. Would be deleted and re-downloaded otherwise") # type: ignore[attr-defined]
                continue
            logger.debug(f"Local path {local_path} is not a regular file, but remote path {f.path} is a regular file. Will be deleted and re-downloaded.") # pylint: disable=logging-fstring-interpolation
            # delete and copy
            to_remove.append(rel_path)
            to_download.append(rel_path)
            continue
        local_timestamp = os.path.getmtime(local_path)
        local_mtime = datetime.fromtimestamp(local_timestamp, tz=timezone.utc)
        # any timestamp difference (older OR newer) triggers a re-download,
        # since only the remote side is considered authoritative
        if remote_mtime != local_mtime:
            timedelta = remote_mtime - local_mtime
            if timedelta.total_seconds() < 0:
                td_str = f'{-timedelta} h older'
            else:
                td_str = f'{timedelta} h newer'
            if is_ignored(remote_path):
                logger.trace(f"[ignore] Local file {local_path} is left as it is since it is ignored. Would be overridden otherwise, since remote file {f.path} is {td_str} than local file.") # type: ignore[attr-defined]
                continue
            logger.debug(f"[to download] Remote file {f.path} is {td_str} than local file {local_path}. Will be re-downloaded.") # pylint: disable=logging-fstring-interpolation
            # copy
            to_download.append(rel_path)
            continue
        else:
            logger.trace(f"[no action] Local file {local_path} exists with same timestamp in remote. Nothing to do.") # type: ignore[attr-defined]
    return to_remove, to_download
def _remove_files(to_remove: list[str], target_dir: str, logger: logging.Logger, dry_run: bool=False) -> None:
dry = ' (dry-run)' if dry_run else ''
for rel_path in to_remove:
path = os.path.join(target_dir, rel_path)
if os.path.isdir(path):
logger.info(f"[delete] Deleting directory {path}.{dry}") # pylint: disable=logging-fstring-interpolation
if not dry_run:
shutil.rmtree(path)
else:
logger.info(f"[delete] Deleting file {path}.{dry}") # pylint: disable=logging-fstring-interpolation
if not dry_run:
os.remove(path)
# check if the directory is empty
if (dir_path := os.path.dirname(path)) != target_dir and not os.listdir(dir_path):
logger.info(f"[delete] Deleting empty directory {dir_path}.{dry}") # pylint: disable=logging-fstring-interpolation
if not dry_run:
os.rmdir(dir_path)
def _download_files(oc: owncloud.Client, remote_fileinfos: list[owncloud.FileInfo], to_download: list[str], target_dir: str, logger: logging.Logger, dry_run: bool=False) -> None:
dry = ' (dry-run)' if dry_run else ''
for rel_path in to_download:
local_path = os.path.join(target_dir, rel_path)
remote_path = os.path.join('/', rel_path)
logger.info(f"[download] Downloading {remote_path} to {local_path}.{dry}") # pylint: disable=logging-fstring-interpolation
if dry_run:
continue
os.makedirs(os.path.dirname(local_path), exist_ok=True)
oc.get_file(remote_path, local_path)
f = next((f for f in remote_fileinfos if f.path == remote_path))
remote_mtimestamp = f.get_last_modified().replace(tzinfo=timezone.utc).timestamp()
os.utime(local_path, (remote_mtimestamp, remote_mtimestamp))
def sync(
    owncloud_link: str,
    local_path: str,
    password: str='',
    ignores: Sequence[str]=(),
    inex: Sequence[tuple[str, str]]=(),
    log_file: Optional[str]='~/oc-downsync.log',
    log_level: Literal['TRACE', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']='INFO',
    log_to_screen: bool=False,
    dry_run: bool=False,
) -> int:
    '''Synchronize files from a public owncloud link share to local_path.

    Parameters:
        owncloud_link: public share link to download from.
        local_path: local target directory.
        password: share password, if the link is protected.
        ignores: glob patterns of local files to keep untouched.
        inex: ordered ('include' | 'exclude', pattern) rules for remote files.
        log_file: log file path ('~' is expanded) or None to disable file logging.
        log_level: logger level name.
        log_to_screen: additionally log to stdout.
        dry_run: only log planned actions, change nothing.

    Returns:
        0 on success, 1 on HTTP or connection errors.
    '''
    if log_file:
        # expand '~' so the default '~/oc-downsync.log' does not create a
        # literal '~' directory in the working directory
        log_file = os.path.expanduser(log_file)
    logger = setup_logger(name='oc-downsync', filename=log_file, loglevel=log_level, screenlogging=log_to_screen, log_exceptions=True)
    dry = ' in dry-run mode' if dry_run else ''
    logger.info(f'Started {sys.argv[0]} to download files from {owncloud_link} to {local_path}{dry}.') # pylint: disable=logging-fstring-interpolation
    oc = None  # stays None if the connection fails; checked in finally
    try:
        oc = owncloud.Client.from_public_link(owncloud_link, folder_password=password)
        remote_fileinfos = _get_remote_fileinfos(oc=oc, includes_excludes=inex, logger=logger)
        local_files = _get_local_files(target_dir=local_path, logger=logger)
        to_remove, to_download = _find_file_changes(target_dir=local_path, local_files=local_files, remote_fileinfos=remote_fileinfos, ignores=ignores, logger=logger)
        if not to_download and not to_remove:
            logger.info('All included files are locally up-to-date or ignored.')
        _remove_files(to_remove=to_remove, target_dir=local_path, logger=logger, dry_run=dry_run)
        _download_files(oc=oc, remote_fileinfos=remote_fileinfos, to_download=to_download, target_dir=local_path, logger=logger, dry_run=dry_run)
    except owncloud.HTTPResponseError as e:
        if '401' in str(e):
            logger.error('Got HTTP error 401, which means "Unauthorized". Can be caused by a wrong owncloud password or link or the link is not publicly shared.')
        else:
            logger.error('Got HTTP error: %s', e)
        return 1
    except requests.ConnectionError as e:
        logger.error('Got a connection error. Maybe you got the URL wrong. Full error msg: %s', e)
        return 1
    finally:
        # guard: if from_public_link raised, oc is still None and calling
        # logout() would raise a NameError masking the original exception
        if oc is not None:
            oc.logout()
    return 0
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    '''Parse the command line arguments.

    Parameters:
        argv: argument vector without the program name (e.g. sys.argv[1:]).

    Returns:
        The parsed argparse.Namespace with attributes owncloud_link,
        local_path, password, ignores, inex, log_level, log_file,
        log_to_screen and dry_run.
    '''
    parser = argparse.ArgumentParser(
        description='Download files from a public owncloud link share to a local target directory. Existing files will be re-downloaded, if the modified date of the files differ. You can select which upstream files to consider using --include and --exclude and you can leave existing local files using --ignore.',
        formatter_class=RawTextArgumentDefaultsHelpFormatter,
    )
    # --include and --exclude share one destination ('inex') via OrderedArgs so
    # their command-line order is preserved for the filtering logic
    inex_group = parser.add_argument_group(
        'Include/Exclude upstream files',
        description=textwrap.dedent('''\
            Files matching the glob-pattern will be included or excluded, respectively. Paths can start with / for the root of the shared directory. Can be used multiple times and the order matters. If --include is not used before --exclude, all files will be included. You can e. g. exclude all *.txt files and include a specific *.txt file again:
            --exclude='*.txt' --include='/include_me.txt'
            But if you do it the other way round, no file will be left, since at first only /include_me.txt will be included and then all *.txt files will be excluded:
            --include='/include_me.txt' --exclude='*.txt'
            Note: * matches also / in the path.
            Note: Protect * using single-quotes around the argument.
            Note: include/exclude patterns are applied to the remote files only. Local files are not filtered by these patterns. This also means excluded files will be deleted locally if they are not ignored by --ignore. If you want to exclude files from being overridden or deleted, use --ignore instead.'''))
    inex_group.add_argument('--include', action=OrderedArgs, dest='inex', help="Include file pattern. Example: --include='/*.txt' will match all *.txt files in all subdirectories. If this is used as only include/exclude option, all other files will be excluded.")
    inex_group.add_argument('--exclude', action=OrderedArgs, dest='inex', help="Exclude file pattern. Example: --exclude='?*/*.txt' will exclude *.txt files in subdirectories, but keep *.txt files in the share's root directory.")
    parser.add_argument('--ignore', action='append', default=[], dest='ignores', help="Glob-pattern to ignore local files. Can be used multiple times. Matching existing files will not be deleted or overridden. However, they will still be downloaded, if they are not present locally. This is useful for files that are not in the remote share (anymore), but you want to keep them locally or for files that you have updated locally. Example: --ignore='*augmented/*' will keep the temporary files in any directory ending with 'augmented'.")
    parser.add_argument('--password', default='', help='In case the public owncloud share link requires a password, provide it with this argument.')
    parser.add_argument('--log-level', default='INFO', choices=['TRACE', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help=textwrap.dedent('''\
                            Set the log level:
                            - TRACE will inform about include / exclude / ignore filter rules and files that require no action. This will output all file paths multiple times.
                            - DEBUG will output upcoming actions (e. g. "... will be deleted"). So only changed files will appear.
                            - INFO will output the actual actions (e. g. "deleting ..."). So only changes will be printed.
                            - WARNING, ERROR: No specific messages on this level currently.
                            - CRITICAL will output unexpected errors that stop the tool. Should not happen. This would be an error in the program, which needs to be fixed.
                            '''))
    parser.add_argument('--log-file', help='Set the log file path to enable file logging.')
    parser.add_argument('--log-to-screen', action='store_true', help='Log to screen (independently of logging to file).')
    parser.add_argument('-n', '--dry-run', action='store_true', help='Do not delete local files or download remote files, but log as usual.')
    parser.add_argument('owncloud_link', help='The public owncloud share link to download from. Example: https://hs-bochum.sciebo.de/s/asdf1234ASDF123')
    parser.add_argument('local_path', help='Destination directory to download files to.')
    return parser.parse_args(args=argv)
# %% main
if __name__ == "__main__":
    args = parse_args(sys.argv[1:])
    # forward the parsed CLI options to sync() and propagate its exit code
    sys.exit(sync(
        owncloud_link=args.owncloud_link,
        local_path=args.local_path,
        password=args.password,
        ignores=args.ignores,
        inex=args.inex,
        log_file=args.log_file,
        log_level=args.log_level,
        log_to_screen=args.log_to_screen,
        dry_run=args.dry_run,
    ))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment