Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read diff in batches #109

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion shelephant/detail.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def copy(
color = theme(theme_name.lower())

for file in src:
if not os.path.isfile(file):
if not os.path.exists(file):
raise OSError(f'Input file "{file:s}" does not exists')

if not os.path.isdir(dest_dir):
Expand Down
166 changes: 97 additions & 69 deletions shelephant/rsync.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,95 @@ def to_remote(
)


def _detail_diff(
    source_dir: str,
    dest_dir: str,
    files: list[str],
    checksum: bool,
    verbose: bool,
    temp_dir: str,
):
    r"""
    Classify one batch of files by parsing the itemized output of an *rsync* dry-run.

    The (normalised) paths are written to ``rsync.txt`` inside ``temp_dir`` and passed to
    ``rsync -nai`` (``-naic`` if ``checksum``) through ``--files-from``.
    Only output lines describing a regular file (``f``) or a symbolic link (``L``) are used.

    :param source_dir: Source directory, passed verbatim to *rsync*.
    :param dest_dir: Destination directory, passed verbatim to *rsync*.
    :param files: Paths to check; interpreted by *rsync* relative to ``source_dir``.
    :param checksum: Add ``c`` to the *rsync* options (compare by checksum).
    :param verbose: Forwarded to ``exec_cmd``.
    :param temp_dir: Existing directory in which the temporary file list is written.
    :return:
        Dictionary with boolean arrays ``"skip"``, ``"create"``, ``"overwrite"``,
        each with one entry per item of ``files`` (in the order of ``files``).
    :raises OSError: If a file/link line starts with an unrecognised update type.
    """
    temp_file = os.path.join(temp_dir, "rsync.txt")
    files = [os.path.normpath(file) for file in files]

    with open(temp_file, "w") as file:
        file.write("\n".join(files))

    # Dry-run ("n") in archive mode ("a") with itemized changes ("i"): nothing is copied.
    opt = "-nai"

    if checksum:
        opt += "c"

    cmd = 'rsync {opt:s} --files-from="{files:s}" "{source_dir:s}" "{dest_dir:s}"'.format(
        source_dir=source_dir, dest_dir=dest_dir, files=temp_file, opt=opt
    )

    # Keep only file/link lines; the length guard avoids an IndexError on very short lines.
    lines = [
        line
        for line in exec_cmd(cmd, verbose).split("\n")
        if len(line) > 1 and line[1] in ("f", "L")
    ]

    if not lines:
        return {
            "skip": np.ones((len(files)), dtype=bool),
            "create": np.zeros((len(files)), dtype=bool),
            "overwrite": np.zeros((len(files)), dtype=bool),
        }

    # Path and mode are collected in the same pass so that they can never get out of step:
    # every retained line contributes exactly one entry to both lists.
    check_paths = []
    mode = []

    for line in lines:
        # todo: split send `<` and receive `>`?
        # ref: https://stackoverflow.com/a/12037164/2646505
        if line[0] == ">" or line[0] == "<":
            code = 1 if line[2:3] == "+" else 2  # create / overwrite
        elif line[0] == "c" or line[1] == "L":
            code = 1  # create
        elif line[0] == ".":
            code = 0  # skip
        else:
            raise OSError(f'Unknown cryptic output "{line:s}"')

        path = line.split(" ", 1)[1]

        if line[1] == "L":
            # links are reported as "name -> target": keep only the name
            path = path.split(" -> ", 1)[0]

        check_paths.append(path)
        mode.append(code)

    mode = np.array(mode, dtype=np.int16)

    # Sort both path lists so the reported paths can be located with a binary search.
    sorter = np.argsort(files)
    source_paths = np.array(files, dtype=str)[sorter]

    i = np.argsort(check_paths)
    check_paths = np.array(check_paths, dtype=str)[i]
    mode = mode[i]

    # Paths that rsync did not report are up-to-date: they keep mode 0 (skip).
    test = np.isin(source_paths, check_paths)

    idx = np.searchsorted(check_paths, source_paths)
    idx = np.where(test, idx, 0)  # index 0 is a placeholder where there is no match
    ret = np.where(test, mode[idx], 0)
    ret = ret.astype(np.int16)

    # Undo the sorting to return the result in the order of ``files``.
    out = np.empty_like(ret)
    out[sorter] = ret

    return {
        "skip": out == 0,
        "create": out == 1,
        "overwrite": out == 2,
    }


def diff(
source_dir: str,
dest_dir: str,
files: list[str],
checksum: bool = False,
verbose: bool = False,
batch_size: int = 3000,
) -> np.array:
r"""
Check if files are different using *rsync*.
Expand All @@ -139,75 +222,20 @@ def diff(
"""

with tempfile.TemporaryDirectory() as temp_dir:
temp_file = os.path.join(temp_dir, "rsync.txt")
files = [os.path.normpath(file) for file in files]

with open(temp_file, "w") as file:
file.write("\n".join(files))

# Run without printing output

opt = "-nai"

if checksum:
opt += "c"

cmd = 'rsync {opt:s} --files-from="{files:s}" "{source_dir:s}" "{dest_dir:s}"'.format(
source_dir=source_dir, dest_dir=dest_dir, files=temp_file, opt=opt
)

lines = list(filter(None, exec_cmd(cmd, verbose).split("\n")))
lines = [line for line in lines if line[1] in ["f", "L"]]

if len(lines) == 0:
return {
"skip": np.ones((len(files)), dtype=bool),
"create": np.zeros((len(files)), dtype=bool),
"overwrite": np.zeros((len(files)), dtype=bool),
}

check_paths = []
for line in lines:
if line[1] == "f":
check_paths.append(line.split(" ", 1)[1])
elif line[:2] == "cL":
check_paths.append(line.split(" ", 1)[1].split(" -> ", 1)[0])

mode = np.zeros((len(check_paths)), dtype=np.int16)

for i, line in enumerate(lines):
# todo: split send `<` and receive `>`?
# ref: https://stackoverflow.com/a/12037164/2646505
if line[0] == ">" or line[0] == "<":
if line[2] == "+":
mode[i] = 1 # create
else:
mode[i] = 2 # overwrite
elif line[0] == "c" or line[1] == "L":
mode[i] = 1 # create
elif line[0] == ".":
pass
else:
raise OSError(f'Unknown cryptic output "{line:s}"')

sorter = np.argsort(files)
source_paths = np.array(files, dtype=str)[sorter]

i = np.argsort(check_paths)
check_paths = np.array(check_paths, dtype=str)[i]
mode = mode[i]
files = np.array(files)
n = len(files)
ret = {
"skip": np.ones(n, dtype=bool),
"create": np.zeros(n, dtype=bool),
"overwrite": np.zeros(n, dtype=bool),
}

test = np.in1d(source_paths, check_paths)
chunks = int(np.ceil(n / float(batch_size)))
devided = np.array_split(np.arange(n, dtype=int), chunks)

idx = np.searchsorted(check_paths, source_paths)
idx = np.where(test, idx, 0)
ret = np.where(test, mode[idx], 0)
ret = ret.astype(np.int16)
out = np.empty_like(ret)
out[sorter] = ret
for _, selection in enumerate(devided):
stat = _detail_diff(source_dir, dest_dir, files[selection], checksum, verbose, temp_dir)
for key in ret:
ret[key][selection] = stat[key]

return {
"skip": out == 0,
"create": out == 1,
"overwrite": out == 2,
}
return ret