syllabus/app/download.py
2025-04-27 18:54:51 -04:00

251 lines
9.5 KiB
Python

import os, yt_dlp, json, requests, re, time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.parse import urlsplit
class grab:
    """Scraping helpers shared by the site-specific downloader classes."""

    @staticmethod
    def season(url):
        """Return the season identifiers offered by a show page.

        Scrapes the page's ``js-switch-season`` ``<select>`` element and
        strips the common ``<url>/season:`` prefix from each option value.

        Raises:
            ValueError: if the page has no season selector.
        """
        page_html = requests.get(url, timeout=30)
        soup = BeautifulSoup(page_html.text, 'html.parser')
        select_element = soup.find('select', class_='js-switch-season')
        if select_element is None:
            # Clearer than the AttributeError a layout change used to cause.
            raise ValueError(f"No season selector found at {url}")
        options = select_element.find_all('option')
        option_values = [option['value'] for option in options if option.has_attr('value')]
        seasons = [item.replace(url + '/season:', '') for item in option_values]
        return seasons

    @staticmethod
    def poster(url, name, save_dir='/data/posters/', force_download=False):
        """Download a poster image into *save_dir* and return its local path.

        The filename is derived from *name* (non-alphanumerics stripped,
        spaces replaced with underscores); the extension comes from the URL
        path, falling back to ``.jpeg``. An already-downloaded file is
        reused unless *force_download* is true.
        """
        path = urlsplit(url).path
        ext = os.path.splitext(path)[-1] or '.jpeg'
        safe_name = re.sub(r'[^a-zA-Z0-9\s]', '', name).replace(' ', '_')
        filepath = os.path.join(save_dir, f"{safe_name}{ext}")
        if not os.path.exists(filepath) or force_download:
            os.makedirs(save_dir, exist_ok=True)
            img_data = requests.get(url, timeout=30).content
            with open(filepath, 'wb') as handler:
                handler.write(img_data)
        return filepath

    @staticmethod
    def thumbnail(ydl, url, location):
        """Save a video's thumbnail into *location* as ``<video id>.jpg``.

        *ydl* is an already-configured ``yt_dlp.YoutubeDL`` instance.
        Failures are printed, never raised, so a missing thumbnail does not
        abort the surrounding download.
        """
        video_info = ydl.extract_info(url, download=False)
        thumbnail_url = video_info.get('thumbnail')
        if thumbnail_url:
            try:
                thumbnail_filename = os.path.join(location, f"{video_info['id']}.jpg")
                with open(thumbnail_filename, 'wb') as thumbnail_file:
                    thumbnail_file.write(requests.get(thumbnail_url, timeout=30).content)
                print("Downloaded MP4 and downloaded thumbnail successfully!")
            except Exception as e:
                print(f"Error downloading thumbnail: {str(e)}")
        else:
            print("Downloaded MP4 but no thumbnail found.")
class dropout:
    """Downloaders for dropout.tv seasons, specials and the series catalogue."""

    # Single source of truth for the paths show()/specials()/series() share.
    JSON_PATH = '/data/dropout.json'
    COOKIE_FILE = '/data/dropout.cookies.txt'
    # Title fragments marking non-episode extras (BTS, trailers, recaps, ...).
    _EXTRAS_REGEX = (
        r"'(?i).*behind.?the.?scenes.*"
        r"|.*trailer.*"
        r"|.*recap.*"
        r"|.*last.looks.*'"
    )

    @staticmethod
    def _season_url(show, season):
        """Map *show* to its base URL via the JSON catalogue and build the season playlist URL."""
        with open(dropout.JSON_PATH, 'r') as json_file:
            url_mapping = json.load(json_file)
        url = next((item['URL'] for item in url_mapping if item['SHOW'] == show), None)
        if url is None:
            raise ValueError(f"Show '{show}' not found in the JSON data.")
        return f'{url}/season:{season}'

    @staticmethod
    def _filtered_entries(playlist_url, filter_pattern):
        """Extract the playlist's entries, keeping only those *filter_pattern* accepts."""
        match_filter = yt_dlp.utils.match_filter_func(filter_pattern)
        ydl_opts = {
            'quiet': True,
            'skip_download': True,
            'cookiefile': dropout.COOKIE_FILE,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            playlist_info = ydl.extract_info(playlist_url, download=False)
        entries = playlist_info.get('entries', [])
        # match_filter returns None when an entry should be KEPT.
        return [entry for entry in entries if match_filter(entry) is None]

    @staticmethod
    def _download(entries, directory, show, season_number, episode_start):
        """Download *entries* into *directory*, numbering from *episode_start* as SxxEyy."""
        episode_start = int(episode_start) if episode_start else 1
        for i, entry in enumerate(entries, start=episode_start):
            filename_template = f"{show} - S{season_number:02}E{i:02} - %(title)s.%(ext)s"
            dl_opts = {
                'format': 'bestvideo+bestaudio/best',
                'audio_quality': '256K',
                'paths': {
                    'temp': '/temp',
                    'home': directory
                },
                'cookiefile': dropout.COOKIE_FILE,
                'writesubtitles': True,
                'subtitleslangs': ['en'],
                'outtmpl': filename_template,
            }
            with yt_dlp.YoutubeDL(dl_opts) as ydl:
                ydl.download([entry['webpage_url']])

    @staticmethod
    def show(show, season, episode_start):
        """Download one season of *show* as regular episodes (S<season>Exx)."""
        directory = f'/tv/{show}/Season {season}/'
        os.makedirs(directory, exist_ok=True)
        playlist_url = dropout._season_url(show, season)
        # Real episodes only: titles must NOT match the extras pattern.
        entries = dropout._filtered_entries(playlist_url, "title !~= " + dropout._EXTRAS_REGEX)
        dropout._download(entries, directory, show, int(season), episode_start)

    @staticmethod
    def specials(show, season, episode_start):
        """Download a season's extras as Plex-style specials (S00Exx)."""
        directory = f'/tv/{show}/Specials/'
        os.makedirs(directory, exist_ok=True)
        playlist_url = dropout._season_url(show, season)
        # Inverse of show(): keep only titles that DO match the extras pattern.
        entries = dropout._filtered_entries(playlist_url, "title ~= " + dropout._EXTRAS_REGEX)
        dropout._download(entries, directory, show, 0, episode_start)

    @staticmethod
    def series():
        """Rebuild the show catalogue JSON from the dropout.tv series page."""
        html = requests.get('https://www.dropout.tv/series', timeout=30).text
        soup = BeautifulSoup(html, 'html.parser')
        shows = []
        for element in soup.find_all('a', class_='browse-item-link'):
            show_data = {'href': element.get('href', '')}
            img = element.find('img')
            if img:
                show_data['src'] = img.get('src', '')
                show_data['alt'] = img.get('alt', '')
            shows.append(show_data)
        json_data = []
        for show in shows:
            json_data.append({
                'SHOW': show.get('alt', 'No title'),
                'URL': show.get('href', 'No link'),
                'LINK': re.sub(r".*dropout.tv/", "", show.get('href', '')),
                'POSTER': grab.poster(show.get('src', ''), show.get('alt', '')),
            })
        sorted_json_data = sorted(json_data, key=lambda x: x['SHOW'])
        # BUG FIX: was './data/dropout.json' (relative to CWD), so the file
        # show()/specials() read from '/data/dropout.json' was never updated.
        with open(dropout.JSON_PATH, 'w') as json_file:
            json.dump(sorted_json_data, json_file, indent=4)
class youtube:
    """Generic yt-dlp downloader whose options depend on the target library."""

    # Destinations archived as audio-only mp3 (branches were duplicated before).
    _AUDIO_HOMES = ('/podcasts', '/asmr')

    @staticmethod
    def ydl(url, location):
        """Download *url* into *location* with per-destination yt-dlp options.

        ``/podcasts`` and ``/asmr`` are extracted to 192K mp3 with metadata
        tags; ``/youtube`` downloads mp4 video using the stored cookie file;
        any other destination gets plain best-quality mp4 video.
        """
        dl_ops = {
            'paths': {'temp': '/temp', 'home': location},
            'outtmpl': '%(uploader)s/%(title)s.%(ext)s',
        }
        if location in youtube._AUDIO_HOMES:
            # Audio libraries: strip video, re-encode to mp3, keep tags.
            dl_ops['format'] = 'bestaudio/best[ext=mp3]'
            dl_ops['postprocessors'] = [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }, {
                'key': 'FFmpegMetadata',
                'add_metadata': True,
            }]
        elif location == '/youtube':
            dl_ops['format'] = 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best'
            dl_ops['cookiefile'] = '/data/youtube.cookies.txt'
        else:
            dl_ops['format'] = 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best'
        with yt_dlp.YoutubeDL(dl_ops) as ydl:
            ydl.download([url])
            # grab.thumbnail(ydl, url, location)