##########################################################################################
# Author: Sascha Johannes                                        Date: March 2018
#
# This script splits a file into parts and uploads these parts via
# S3 multipart upload into IBM Cloud Object Storage.
#
# Step 1 - SPLIT FILE:
# Split the file into parts (temporarily saved in ./temp_folder)
#
# Step 2 - MULTIPART UPLOAD:
# Phase 1 = IBM COS - Initiate IBM COS Multipart Upload
# Phase 2 = IBM COS - Upload Parts
# Phase 3 = IBM COS - Complete Upload
# Phase 4 = System Clean-Up (delete temp_folder)
#
# Prereqs:
# - A connection to an existing COS service instance (incl. authorization) is required;
#   the AWS CLI authorization method is easy to install and configure
#   (1. install the AWS CLI from the AWS website; 2. run 'aws configure' and enter the credentials for your COS bucket)
# - the imported modules must be available on your system
# - this script has to be in the same folder as the file you want to upload
#
# Preparations:
# - check/adapt the IBM COS URL (endpoint)
# - enter your bucket name (bucket_name)
#
# Notes:
# - this script was written and tested on Windows 10 with Python 3.6.4
# - in a production environment, more error handling would be required; this script is
#   just a simple sample to demonstrate the advantages and general process of multipart upload
# - to demonstrate the time savings, upload a midsize file (e.g. 30 MB) with this script, do the
#   same via a "normal" PUT script/command (see the sketch after this header), and compare the run times
# - I am an absolute beginner in the area of "scripting", so please excuse any uncommon or awkward programming style ;o)
#
##########################################################################################
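# For the run-time comparison mentioned in the notes above, a "normal" single PUT could
# look like the following sketch. This is only an illustration, not part of this script;
# it assumes the same ibm_boto3 client and the endpoint/bucket_name/file values that are
# set up further below:
#
#   client = ibm_boto3.client('s3', endpoint_url=endpoint)
#   with open(file, 'rb') as f:
#       client.put_object(Bucket=bucket_name, Key=file, Body=f)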
import ibm_boto3
import sys
import os
import glob
import threading
endpoint = 'https://s3.eu-geo.objectstorage.softlayer.net' # IBM COS endpoint URL
bucket_name = 'PLEASE ENTER YOUR BUCKET NAME' # name of the IBM COS target bucket
if bucket_name == 'PLEASE ENTER YOUR BUCKET NAME':              # check if bucket_name has been changed
    print()
    print('-- please enter a valid bucket name in the bucket_name variable above --')
    sys.exit()
###########################################
######## Step 1 - SPLIT FILE ##############
###########################################
kilobytes = 1024
megabytes = kilobytes * 1024
chunksize = int(6 * megabytes)                                  # 6 MB chunks; the S3 minimum part size is 5 MB (except for the last part)
script_dir = os.path.dirname(os.path.realpath(__file__))        # script folder
def split(file, chunksize=chunksize):
    partnum = 0
    infile = open(file, 'rb')                                   # use binary mode on Windows
    if not os.path.exists(os.path.join(script_dir, 'temp_folder')):   # if temp_folder does not exist
        os.mkdir('temp_folder')                                 # create temp_folder to save the parts
    else:                                                       # if temp_folder already exists
        for parts in os.listdir('temp_folder/'):                # delete old parts if existing
            os.remove(os.path.join('temp_folder/', parts))
    while True:                                                 # save the parts in temp_folder
        chunk = infile.read(chunksize)                          # get the next part <= chunksize
        if not chunk:
            break
        partnum = partnum + 1
        filename = os.path.join('temp_folder', 'part%04d' % partnum)
        fileobj = open(filename, 'wb')
        fileobj.write(chunk)
        fileobj.close()
    infile.close()
    assert partnum <= 9999                                      # the part naming scheme fails with 5 digits
    return partnum
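# Quick sanity check (illustrative numbers): with 6 MB chunks, the midsize 30 MB file
# suggested in the notes above is split into 5 parts, saved as temp_folder/part0001 .. part0005.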
##### MAIN ########
os.system('cls')                                                # clear the screen (Windows)
print()
v = 0
directory_list = glob.glob('*')                                 # read all filenames within the script folder
while v == 0:
    print('File to be uploaded?')
    print("or 'exit'")
    print()
    file = input('Filename: ')                                  # input the filename
    if file not in directory_list and file != 'exit':           # check if the entered file exists
        os.system('cls')
        print()
        print('-- File must exist in the current folder --')
        print()
    elif file == 'exit':                                        # terminate the script if 'exit' has been entered
        sys.exit()
    else:                                                       # continue with the split process
        v = 1
try:
    parts = split(file, chunksize)                              # run the split function
except Exception as err:
    print('Error during split:', err)                           # error message in case of a problem during the split
else:
    print('----------------')
    print('Split finished:', parts, 'parts were created')       # output the number of created parts
###########################################
####### Step 2 - MULTIPART UPLOAD #########
###########################################
client = ibm_boto3.client('s3', endpoint_url=endpoint)          # create the IBM COS client instance
### PHASE1: INITIATE ###
def initiate_mu():
    response = client.create_multipart_upload(                  # S3 command to start a multipart upload
        Bucket=bucket_name,
        Key=file
    )
    upload_id = response['UploadId']
    return upload_id
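# create_multipart_upload returns a dict; this script only needs its 'UploadId' field, which
# identifies the open multipart upload in every later upload_part and complete call. The
# response looks roughly like this (a sketch; the UploadId value is made up):
#   {'Bucket': '<your bucket>', 'Key': '<your file>', 'UploadId': '0100018a-...'}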
### PHASE2: UPLOAD ###
upload_id = initiate_mu()
etag_dic = {}
new_parts = glob.glob(os.path.join(script_dir, 'temp_folder', 'part*'))   # create a list of the parts
parts_counter = len(new_parts)                                  # count the number of parts
def upload(num):
    filename = os.path.join('temp_folder', 'part%04d' % num)
    print('upload started for: part%04d' % num)
    upload_part = open(os.path.join(script_dir, filename), 'rb')   # open the part to upload
    response = client.upload_part(                              # S3 command to upload one part of the file
        Body=upload_part,
        Bucket=bucket_name,
        Key=file,
        PartNumber=num,
        UploadId=upload_id)
    etag_dic[num] = response['ETag']                            # store the returned ETag; needed in phase 3
    upload_part.close()                                         # close the part file
    print('upload finished for: part%04d' % num)
    print('----------------')
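# Note on concurrency: ibm_boto3 mirrors boto3, whose low-level clients are documented as
# thread-safe, so the single shared client above can serve all upload threads. Writing into
# etag_dic from several threads is also safe here, because every thread assigns a distinct
# key and a single dict item assignment is atomic in CPython.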
threads = []
for t in range(1, parts_counter + 1):                           # create one thread per part
    thread = threading.Thread(target=upload, args=(t,))
    threads += [thread]
    thread.start()
print('----------------')
for x in threads:                                               # wait until all threads are finished
    x.join()
### PHASE3: COMPLETE UPLOAD ###
parts_list = []
for m in range(0, parts_counter):
    parts_list.append({'ETag': etag_dic[m + 1], 'PartNumber': m + 1})   # list of the uploaded parts and their ETags
response = client.complete_multipart_upload(                    # S3 command to complete the multipart upload
    Bucket=bucket_name,
    Key=file,
    MultipartUpload={'Parts': parts_list},
    UploadId=upload_id
)
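# In a production script, a failed or interrupted multipart upload should be aborted,
# otherwise the already-uploaded parts keep occupying storage in the bucket. A minimal
# sketch (assuming the client, bucket_name, file and upload_id from above):
#
#   try:
#       response = client.complete_multipart_upload(Bucket=bucket_name, Key=file,
#                                                   MultipartUpload={'Parts': parts_list},
#                                                   UploadId=upload_id)
#   except Exception:
#       client.abort_multipart_upload(Bucket=bucket_name, Key=file, UploadId=upload_id)
#       raise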
### PHASE4: SYSTEM CLEAN-UP ###
for parts in os.listdir('temp_folder/'):                        # delete the parts
    os.remove(os.path.join('temp_folder/', parts))
os.rmdir('temp_folder/')                                        # delete temp_folder
print('----------------')
print('All temp data has been deleted from local disk')
print()
print('----- END OF SCRIPT -----')
############ END OF SCRIPT ##################