From 2b3c5c6cc679d61fc3061905c551aed3c6087930 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Tue, 18 Dec 2018 17:49:39 +0000 Subject: [PATCH 01/12] Ideas around rsync --- internal/pkg/pfsprovider/ansible/copy.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/internal/pkg/pfsprovider/ansible/copy.go b/internal/pkg/pfsprovider/ansible/copy.go index 2a866693..fcda67b7 100644 --- a/internal/pkg/pfsprovider/ansible/copy.go +++ b/internal/pkg/pfsprovider/ansible/copy.go @@ -11,6 +11,10 @@ func processDataCopy(volume registry.Volume, request registry.DataCopyRequest) e if err != nil { return err } + if cmd == "" { + log.Println("No files to copy for:", volume.Name) + return nil + } log.Printf("FAKE copy: %s", cmd) return nil @@ -22,24 +26,24 @@ func generateDataCopyCmd(volume registry.Volume, request registry.DataCopyReques return "", err } - cmd := fmt.Sprintf("sudo su `getent passwd %d | cut -d: -f1` %s", volume.Owner, rsync) + cmd := fmt.Sprintf("sudo -g '#%d' -u '#%d' %s", volume.Group, volume.Owner, rsync) + cmd = fmt.Sprintf("bash -c \"export JOB='%s' && %s\"", volume.JobName, cmd) return cmd, nil } func generateRsyncCmd(volume registry.Volume, request registry.DataCopyRequest) (string, error) { if request.Source == "" && request.Destination == "" { - log.Println("No files to copy for:", volume.Name) return "", nil } var flags string if request.SourceType == registry.Directory { - flags = "-r " + flags = "-r -ospgu --stats" } else if request.SourceType == registry.File { - flags = "" + flags = "-ospgu --stats" } else { return "", fmt.Errorf("unsupported source type %s for volume: %s", request.SourceType, volume.Name) } - return fmt.Sprintf("rsync %s%s %s", flags, request.Source, request.Destination), nil + return fmt.Sprintf("rsync %s %s %s", flags, request.Source, request.Destination), nil } From 91ff53198a06839ba07cf22c1871bdd056ba6cc5 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 21 Dec 2018 16:31:32 +0000 Subject: 
[PATCH 02/12] Remove invalid path from persistent burst buffers --- internal/pkg/dacctl/persistent.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/pkg/dacctl/persistent.go b/internal/pkg/dacctl/persistent.go index 50377438..f131b432 100644 --- a/internal/pkg/dacctl/persistent.go +++ b/internal/pkg/dacctl/persistent.go @@ -117,8 +117,6 @@ func CreateVolumesAndJobs(volReg registry.VolumeRegistry, poolRegistry registry. JobVolume: volume.Name, // Even though its a persistent buffer, we add it here to ensure we delete buffer Paths: make(map[string]string), } - job.Paths[fmt.Sprintf("DW_PERSISTENT_STRIPED_%s", volume.Name)] = fmt.Sprintf( - "/mnt/dac/job/%s/multijob/%s", job.Name, volume.Name) err = volReg.AddJob(job) if err != nil { From 0de544c4fe0762879e099b18d2eef007fd4d0491 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 21 Dec 2018 16:40:38 +0000 Subject: [PATCH 03/12] Update example job Add DW_JOB_STRIPED into the environment in which rsync is executed --- internal/pkg/pfsprovider/ansible/copy.go | 3 ++- tools/slurm-test.sh | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/internal/pkg/pfsprovider/ansible/copy.go b/internal/pkg/pfsprovider/ansible/copy.go index fcda67b7..e24e23ac 100644 --- a/internal/pkg/pfsprovider/ansible/copy.go +++ b/internal/pkg/pfsprovider/ansible/copy.go @@ -27,7 +27,8 @@ func generateDataCopyCmd(volume registry.Volume, request registry.DataCopyReques } cmd := fmt.Sprintf("sudo -g '#%d' -u '#%d' %s", volume.Group, volume.Owner, rsync) - cmd = fmt.Sprintf("bash -c \"export JOB='%s' && %s\"", volume.JobName, cmd) + dacHostBufferPath := fmt.Sprintf("/mnt/lustre/%s/global", volume.UUID) + cmd = fmt.Sprintf("bash -c \"export DW_JOB_STRIPED='%s' && %s\"", dacHostBufferPath, cmd) return cmd, nil } diff --git a/tools/slurm-test.sh b/tools/slurm-test.sh index c38e6f6f..1057e012 100755 --- a/tools/slurm-test.sh +++ b/tools/slurm-test.sh @@ -12,11 +12,16 @@ echo "#!/bin/bash #DW jobdw capacity=2TB 
access_mode=striped,private type=scratch #DW persistentdw name=mytestbuffer #DW swap 5MB -#DW stage_in source=/global/cscratch1/filename1 destination=\$DW_JOB_STRIPED/filename1 type=file -#DW stage_out source=\$DW_JOB_STRIPED/outdir destination=/global/scratch1/outdir type=directory +#DW stage_in source=/usr/local/bin/dacd /global/cscratch1/filename1 destination=\$DW_JOB_STRIPED/filename1 type=file +#DW stage_out source=\$DW_JOB_STRIPED/outdir destination=/tmp type=directory + env df -h swapon + +mkdir \$DW_JOB_STRIPED/outdir +df -h > \$DW_JOB_STRIPED/outdir/dfoutput + echo \$HOSTNAME " > use-persistent.sh From dbf00f410a64d195bb2eb4d5b015271f89092684 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 21 Dec 2018 16:43:04 +0000 Subject: [PATCH 04/12] Add ls output in the slurm example --- tools/slurm-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/slurm-test.sh b/tools/slurm-test.sh index 1057e012..e6dfa53e 100755 --- a/tools/slurm-test.sh +++ b/tools/slurm-test.sh @@ -21,6 +21,7 @@ swapon mkdir \$DW_JOB_STRIPED/outdir df -h > \$DW_JOB_STRIPED/outdir/dfoutput +ls -al \$DW_JOB_STRIPED > \$DW_JOB_STRIPED/outdir/lsoutput echo \$HOSTNAME " > use-persistent.sh From 35577167e05d3f68d354cc84c4ed1a7b45124944 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 21 Dec 2018 16:43:40 +0000 Subject: [PATCH 05/12] Comment out umount that fails on slurm compute --- tools/dac-reset.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/dac-reset.sh b/tools/dac-reset.sh index 274e16fa..874c411c 100755 --- a/tools/dac-reset.sh +++ b/tools/dac-reset.sh @@ -7,8 +7,8 @@ set +a /usr/local/bin/etcdctl --key /etc/data-acc/pki/`hostname`.dac.hpc.cam.ac.uk-key.pem --cert /etc/data-acc/pki/`hostname`.dac.hpc.cam.ac.uk.pem --cacert /etc/data-acc/pki/ca.pem del --prefix '' # Kill all lustre filesystems -ssh slurm-cpu1 sudo umount -atl lustre -ssh slurm-cpu2 sudo umount -atl lustre +#ssh slurm-cpu1 sudo umount -atl lustre +#ssh slurm-cpu2 sudo 
umount -atl lustre ssh dac1 sudo umount -at lustre ssh dac2 sudo umount -at lustre ssh dac3 sudo umount -at lustre From 7050ea46a23413610bb27a170913604cd1f3dc45 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 21 Dec 2018 16:45:12 +0000 Subject: [PATCH 06/12] Always push out new data-acc binary --- dac-ansible/roles/data-acc/tasks/main.yml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/dac-ansible/roles/data-acc/tasks/main.yml b/dac-ansible/roles/data-acc/tasks/main.yml index cfd81a41..82812222 100644 --- a/dac-ansible/roles/data-acc/tasks/main.yml +++ b/dac-ansible/roles/data-acc/tasks/main.yml @@ -1,13 +1,5 @@ --- -- name: check for installation of data-acc - become: yes - stat: - path: '{{data_acc_install_dir}}/{{data_acc_name}}/bin' - changed_when: false - register: data_acc_binary_dir - -- when: not data_acc_binary_dir.stat.exists - block: +- block: - name: download data_acc become: yes become_user: root @@ -33,7 +25,6 @@ remote_src: yes src: /tmp/{{data_acc_tgz}} dest: '{{data_acc_install_dir}}/{{data_acc_name}}' - creates: '{{data_acc_install_dir}}/{{data_acc_name}}/bin' always: - name: delete archive become: yes From 6dd34e23d2f17ddc51e152db6c4e96802fcce54b Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 21 Dec 2018 17:11:04 +0000 Subject: [PATCH 07/12] Ensure global dir setup before trying copy --- internal/pkg/pfsprovider/ansible/copy.go | 18 +++++++++++++++++- internal/pkg/pfsprovider/ansible/copy_test.go | 8 +++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/internal/pkg/pfsprovider/ansible/copy.go b/internal/pkg/pfsprovider/ansible/copy.go index e24e23ac..bcda7bd6 100644 --- a/internal/pkg/pfsprovider/ansible/copy.go +++ b/internal/pkg/pfsprovider/ansible/copy.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/RSE-Cambridge/data-acc/internal/pkg/registry" "log" + "path" ) func processDataCopy(volume registry.Volume, request registry.DataCopyRequest) error { @@ -16,7 +17,22 @@ func 
processDataCopy(volume registry.Volume, request registry.DataCopyRequest) e return nil } - log.Printf("FAKE copy: %s", cmd) + log.Printf("Doing copy: %s", cmd) + + // Make sure global dir is setup correctly + // TODO: share code with mount better + // TODO: Probably should all get setup in fs-ansible really!! + mountDir := fmt.Sprintf("/mnt/lustre/%s", volume.UUID) + sharedDir := path.Join(mountDir, "/global") + if err := mkdir("localhost", sharedDir); err != nil { + return err + } + if err := fixUpOwnership("localhost", volume.Owner, volume.Group, sharedDir); err != nil { + return err + } + + // Do the copy + runner.Execute("localhost", cmd) return nil } diff --git a/internal/pkg/pfsprovider/ansible/copy_test.go b/internal/pkg/pfsprovider/ansible/copy_test.go index 94b1de33..5cc2d4b9 100644 --- a/internal/pkg/pfsprovider/ansible/copy_test.go +++ b/internal/pkg/pfsprovider/ansible/copy_test.go @@ -10,6 +10,8 @@ func Test_GenerateDataCopy(t *testing.T) { testVolume := registry.Volume{ Name: registry.VolumeName("asdf"), Owner: 1001, + Group: 1002, + UUID: "fsuuid", } request := registry.DataCopyRequest{} @@ -22,7 +24,7 @@ func Test_GenerateDataCopy(t *testing.T) { request.Destination = "dest" cmd, err = generateDataCopyCmd(testVolume, request) assert.Nil(t, err) - assert.Equal(t, "sudo su `getent passwd 1001 | cut -d: -f1` rsync source dest", cmd) + assert.Equal(t, "bash -c \"export DW_JOB_STRIPED='/mnt/lustre/fsuuid/global' && sudo -g '#1002' -u '#1001' rsync -ospgu --stats source dest\"", cmd) request.SourceType = registry.List request.Source = "list_filename" @@ -47,14 +49,14 @@ func Test_GenerateRsyncCmd(t *testing.T) { request.Destination = "dest" cmd, err = generateRsyncCmd(testVolume, request) assert.Nil(t, err) - assert.Equal(t, "rsync source dest", cmd) + assert.Equal(t, "rsync -ospgu --stats source dest", cmd) request.SourceType = registry.Directory request.Source = "source" request.Destination = "dest" cmd, err = generateRsyncCmd(testVolume, request) 
assert.Nil(t, err) - assert.Equal(t, "rsync -r source dest", cmd) + assert.Equal(t, "rsync -r -ospgu --stats source dest", cmd) request.SourceType = registry.List request.Source = "list_filename" From cdd8b74124d4bad657e7f224feea1621fa65c0aa Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 21 Dec 2018 17:26:06 +0000 Subject: [PATCH 08/12] Actually return any errors in copy --- internal/pkg/pfsprovider/ansible/copy.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/pkg/pfsprovider/ansible/copy.go b/internal/pkg/pfsprovider/ansible/copy.go index bcda7bd6..820b3890 100644 --- a/internal/pkg/pfsprovider/ansible/copy.go +++ b/internal/pkg/pfsprovider/ansible/copy.go @@ -32,8 +32,7 @@ func processDataCopy(volume registry.Volume, request registry.DataCopyRequest) e } // Do the copy - runner.Execute("localhost", cmd) - return nil + return runner.Execute("localhost", cmd) } func generateDataCopyCmd(volume registry.Volume, request registry.DataCopyRequest) (string, error) { From 385453d130822378bd55656e61de967b3bfe148f Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 21 Dec 2018 17:38:16 +0000 Subject: [PATCH 09/12] Run copy as centos in the slurm test --- docker-slurm/burst_buffer.conf | 1 - tools/slurm-test.sh | 9 +++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docker-slurm/burst_buffer.conf b/docker-slurm/burst_buffer.conf index b2abf520..af36cbfc 100644 --- a/docker-slurm/burst_buffer.conf +++ b/docker-slurm/burst_buffer.conf @@ -1,4 +1,3 @@ -AllowUsers=root,slurm Flags=EnablePersistent,PrivateData StageInTimeout=3600 diff --git a/tools/slurm-test.sh b/tools/slurm-test.sh index e6dfa53e..1a3082a6 100755 --- a/tools/slurm-test.sh +++ b/tools/slurm-test.sh @@ -51,11 +51,12 @@ scontrol show burstbuffer squeue echo "***Use persistent buffer***" +adduser centos cat use-persistent.sh -su slurm -c 'sbatch use-persistent.sh' -su slurm -c 'sbatch use-persistent.sh' -su slurm -c 'sbatch use-persistent.sh' -su slurm 
-c 'sbatch use-persistent.sh' +su centos -c 'sbatch use-persistent.sh' +su centos -c 'sbatch use-persistent.sh' +su centos -c 'sbatch use-persistent.sh' +su centos -c 'sbatch use-persistent.sh' squeue sleep $SLEEP_INTERVAL From 4d8792ab7ce9c73d6b3657ef093e27e1f0fd1573 Mon Sep 17 00:00:00 2001 From: John Date: Wed, 2 Jan 2019 22:18:27 +0000 Subject: [PATCH 10/12] Escape the DW_JOB_STRIPED in rsync --- internal/pkg/pfsprovider/ansible/copy.go | 9 ++++++++- internal/pkg/pfsprovider/ansible/copy_test.go | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/internal/pkg/pfsprovider/ansible/copy.go b/internal/pkg/pfsprovider/ansible/copy.go index 820b3890..3fc8a84b 100644 --- a/internal/pkg/pfsprovider/ansible/copy.go +++ b/internal/pkg/pfsprovider/ansible/copy.go @@ -5,6 +5,7 @@ import ( "github.com/RSE-Cambridge/data-acc/internal/pkg/registry" "log" "path" + "strings" ) func processDataCopy(volume registry.Volume, request registry.DataCopyRequest) error { @@ -61,5 +62,11 @@ func generateRsyncCmd(volume registry.Volume, request registry.DataCopyRequest) return "", fmt.Errorf("unsupported source type %s for volume: %s", request.SourceType, volume.Name) } - return fmt.Sprintf("rsync %s %s %s", flags, request.Source, request.Destination), nil + return fmt.Sprintf("rsync %s %s %s", flags, + escapePath(request.Source), + escapePath(request.Destination)), nil +} + +func escapePath(path string) string { + return strings.Replace(path, "$DW_JOB_STRIPED", "\\$DW_JOB_STRIPED", 1) } diff --git a/internal/pkg/pfsprovider/ansible/copy_test.go b/internal/pkg/pfsprovider/ansible/copy_test.go index 5cc2d4b9..71aa8d57 100644 --- a/internal/pkg/pfsprovider/ansible/copy_test.go +++ b/internal/pkg/pfsprovider/ansible/copy_test.go @@ -20,11 +20,11 @@ func Test_GenerateDataCopy(t *testing.T) { assert.Empty(t, cmd) request.SourceType = registry.File - request.Source = "source" + request.Source = "$DW_JOB_STRIPED/source" request.Destination = "dest" cmd, err = 
generateDataCopyCmd(testVolume, request) assert.Nil(t, err) - assert.Equal(t, "bash -c \"export DW_JOB_STRIPED='/mnt/lustre/fsuuid/global' && sudo -g '#1002' -u '#1001' rsync -ospgu --stats source dest\"", cmd) + assert.Equal(t, "bash -c \"export DW_JOB_STRIPED='/mnt/lustre/fsuuid/global' && sudo -g '#1002' -u '#1001' rsync -ospgu --stats \\$DW_JOB_STRIPED/source dest\"", cmd) request.SourceType = registry.List request.Source = "list_filename" From c4c9b56fc19fa591eba02025a3a972ff6e6737d1 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Thu, 3 Jan 2019 09:17:56 +0000 Subject: [PATCH 11/12] Fix up stage_in command example --- tools/slurm-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/slurm-test.sh b/tools/slurm-test.sh index 1a3082a6..693e9172 100755 --- a/tools/slurm-test.sh +++ b/tools/slurm-test.sh @@ -12,7 +12,7 @@ echo "#!/bin/bash #DW jobdw capacity=2TB access_mode=striped,private type=scratch #DW persistentdw name=mytestbuffer #DW swap 5MB -#DW stage_in source=/usr/local/bin/dacd /global/cscratch1/filename1 destination=\$DW_JOB_STRIPED/filename1 type=file +#DW stage_in source=/usr/local/bin/dacd destination=\$DW_JOB_STRIPED/filename1 type=file #DW stage_out source=\$DW_JOB_STRIPED/outdir destination=/tmp type=directory env From 8e1f08c154f88a4c3192a370df23e666aee700fd Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Thu, 3 Jan 2019 09:24:10 +0000 Subject: [PATCH 12/12] Add array job in slurm test script --- tools/slurm-test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/slurm-test.sh b/tools/slurm-test.sh index 693e9172..3d12d3ac 100755 --- a/tools/slurm-test.sh +++ b/tools/slurm-test.sh @@ -57,6 +57,8 @@ su centos -c 'sbatch use-persistent.sh' su centos -c 'sbatch use-persistent.sh' su centos -c 'sbatch use-persistent.sh' su centos -c 'sbatch use-persistent.sh' +su centos -c 'sbatch --array=1-10 test-persistent.sh' + squeue sleep $SLEEP_INTERVAL