I'm using the pytube library to download videos. Locally, I pointed the output_path argument of pytube's download function at the folder on my machine where I wanted the video saved. However, I am now using an AWS free-tier EC2 instance, which comes with no storage, and I can't work out how to download the video directly to an S3 bucket. Passing the S3 URI as output_path does not work, because it looks for that location on the local system.
I tried mounting the S3 bucket and setting output_path to the mount location, but the videos still were not downloaded: the script executed without any errors and without downloading the videos.
Is there any solution that lets one download YouTube videos with pytube directly to an S3 bucket? Any solution is much appreciated!
Previous code (saves files locally):
# function safe file name
def safe_filename(s: str, max_length: int = 255) -> str:
    """Strip characters that are unsafe in file names and cap the length.

    Removes the ASCII control characters 0-31 and the punctuation listed
    below, then truncates the result to ``max_length`` characters. Spaces
    are kept.

    Args:
        s: Raw text, e.g. a video title.
        max_length: Maximum number of characters to keep.

    Returns:
        The sanitized, truncated string.
    """
    # Characters in range 0-31 (0x00-0x1F) are not allowed in ntfs filenames.
    control_chars = "".join(chr(code) for code in range(32))
    punctuation = "\"#$%'*,./:;<>?\\^|~()"
    # str.translate deletes every listed character literally, so "(" and ")"
    # cannot be misread as regex grouping syntax and are really removed.
    removal_table = str.maketrans("", "", control_chars + punctuation)
    return s.translate(removal_table)[:max_length]
# function simplify name
def simplify(text):
    """Return ``text`` reduced to plain ASCII.

    The text is NFD-normalized so that an accented letter is split into its
    base letter plus combining marks; every character that cannot be encoded
    as ASCII is then dropped.

    Args:
        text: UTF-8 encoded ``bytes`` or an already decoded ``str``.

    Returns:
        An ASCII-only ``str``.

    Raises:
        UnicodeDecodeError: If ``text`` is ``bytes`` that are not valid UTF-8.
    """
    # Decode only when given bytes; a str has no .decode() on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')
def downloader(lnks, threadlabel: str, output_path: str = 'Videos') -> list:
    """Download each YouTube link as a progressive mp4 into ``output_path``.

    The file name is the video title reduced to ASCII letters and digits.
    A link whose ``<name>.mp4`` already exists in ``output_path`` is skipped.
    Processing stops at the first link that fails; the failed link is written
    to ``failed_<threadlabel>.txt`` in the current working directory.

    Args:
        lnks: Iterable of YouTube video URLs.
        threadlabel: Label used in the name of the failure log file.
        output_path: Directory the videos are saved to; the "already
            downloaded" check looks in this same directory.

    Returns:
        The list of links that could not be downloaded.
    """
    # holds links so that we can retry to download those we missed
    failed = []
    for link in lnks:
        try:
            yt = YouTube(link)
            name = safe_filename(yt.title).strip()
            name = simplify(name.encode("utf-8"))
            # Keep only ASCII letters and digits in the final file name.
            name = re.sub('[^A-Za-z0-9]+', '', name)
            print(name)
            # Check the directory we actually download into, instead of a
            # machine-specific absolute path that does not exist elsewhere.
            if os.path.exists(os.path.join(output_path, name + '.mp4')):
                print(name, ' done')
                continue
            print(yt.streams.order_by('resolution').desc())
            print('----------------------------------------------------------------------------------------')
            # Build the stream query once and reuse the selected stream.
            stream = (
                yt.streams
                .filter(progressive=True, file_extension='mp4', type="video")
                .order_by('resolution')
                .desc()
                .first()
            )
            print(stream)
            stream.download(output_path=output_path, filename=name)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts the run.
            print('----------------------', link, 'not downloaded','----------------------')
            failed.append(link)
            # NOTE(review): the loop stops at the first failure, so `failed`
            # never holds more than one link; drop this `break` if the
            # remaining links should still be attempted - confirm intent.
            break
    if failed:
        print(failed)
        # One link per line; writing the list object itself raises TypeError.
        with open('failed_' + threadlabel + '.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(failed) + '\n')
    return failed
# Single-link trial run; with the label 'test' a failure is logged to failed_test.txt.
downloader(lnks=['https://www.youtube.com/watch?v=Pk4tE6Jfakg'], threadlabel='test')
Code with output_path set to the S3 URI:
# Attempt 1: pass the S3 URI straight to download() as output_path.
# Per the description above, the URI is looked up as a location on the local system.
yt.streams.filter(progressive=True, file_extension='mp4', type="video").order_by('resolution').desc().first().download(output_path=S3_URI, filename= name)
Code with output_path set to the mounted S3 location:
# Attempt 2: point output_path at the directory where the bucket is mounted.
# NOTE(review): 'mnt/bucket/folder' has no leading slash, so it is resolved relative to the
# current working directory - confirm the mount point is not actually /mnt/bucket/folder.
yt.streams.filter(progressive=True, file_extension='mp4', type="video").order_by('resolution').desc().first().download(output_path='mnt/bucket/folder', filename= name)