# File indexing completed on 2024-03-24 05:28:22
import bs4
import re
import json
import datetime
from .session.session import session


class YoutubeSearcher:
    """Scrape YouTube pages by parsing the JSON blob embedded in their HTML.

    Every public entry point fetches a page through the shared ``session``,
    extracts the ``ytInitialData`` JSON from a ``<script>`` tag and flattens
    the renderer structures it needs into plain dicts, which are stored in
    ``self.data`` and returned.
    """

    def __init__(self, location_code=None, user_agent=None):
        """Create a searcher.

        Args:
            location_code: Value sent as the ``gl`` query parameter;
                defaults to ``"US"`` when falsy.
            user_agent: ``User-Agent`` header value; a desktop Chrome
                string is used when falsy.
        """
        self.location_code = location_code or "US"

        # TODO make compatible with mobile user_agents
        self.user_agent = user_agent or (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
        )

        self.base_url = "https://www.youtube.com"
        self.headers = {
            'User-Agent': self.user_agent
        }
        self._reset_state()

    def _reset_state(self):
        """Reset every per-search result container to its empty value."""
        self.featured_channel = {"videos": [], "playlists": []}
        self.data = {}
        self.videos = []
        self.playlists = []
        self.related_to_search = []
        self.related_queries = []
        self.radio = []
        self.movies = []
        self.promoted = []
        self.videos_on_page = []
        self.corrected_query = None
        self.contents = None
        self.primary_contents = None
        self.secondary_contents = None
        self.primary_contents_page = None

    # ------------------------------------------------------------------
    # Small extraction helpers shared by the renderer parsers
    # ------------------------------------------------------------------
    @staticmethod
    def _join_runs(node):
        """Join the ``text`` of every entry in ``node["runs"]`` with spaces."""
        return " ".join(run["text"] for run in node["runs"])

    @staticmethod
    def _endpoint_url(node):
        """Return the relative web URL stored under ``navigationEndpoint``."""
        return node['navigationEndpoint']['commandMetadata'][
            'webCommandMetadata']['url']

    @staticmethod
    def _view_count_text(vid, default):
        """Return the short view (or live watcher) count text of ``vid``.

        ``default`` is returned when ``shortViewCountText`` is missing or
        carries neither ``simpleText`` nor any ``runs``.
        """
        counter = vid.get("shortViewCountText")
        if not counter:
            return default
        if "simpleText" in counter:
            return counter["simpleText"]
        runs = counter.get("runs") or []
        if not runs:
            return default
        # Only the first two runs are used (count + unit), joined by a space.
        return " ".join(run["text"] for run in runs[:2])

    @staticmethod
    def _published_time(vid):
        """Return the published-time text; a missing field is reported as "Live"."""
        if "publishedTimeText" in vid:
            return vid["publishedTimeText"]["simpleText"]
        return "Live"

    @staticmethod
    def _length_info(vid):
        """Return ``(length_txt, length_caption)``; both "Live" without ``lengthText``."""
        if "lengthText" in vid:
            length = vid["lengthText"]
            caption = length['accessibility']["accessibilityData"]["label"]
            return length['simpleText'], caption
        return "Live", "Live"

    @staticmethod
    def _channel_name(vid, key="ownerText"):
        """Return the first run text under ``key``, or "" when ``key`` is absent."""
        if key in vid:
            return vid[key]["runs"][0]["text"]
        return ""

    def _description(self, vid, fallback):
        """Return the joined description snippet, or ``fallback`` if there is none."""
        if 'descriptionSnippet' in vid:
            return self._join_runs(vid['descriptionSnippet'])
        return fallback  # occasionally happens

    def _video_entry(self, vid, views_default, published_fallback=None):
        """Flatten a ``videoRenderer`` dict into the result-dict layout.

        Args:
            vid: The ``videoRenderer`` payload.
            views_default: Value for ``views`` when no count is available.
            published_fallback: When not ``None``, used for
                ``published_time`` if ``publishedTimeText`` is present but
                malformed; when ``None`` the lookup error propagates.
        """
        try:
            published_time = self._published_time(vid)
        except (KeyError, TypeError):
            if published_fallback is None:
                raise
            published_time = published_fallback

        title = self._join_runs(vid['title'])
        length_txt, length_caption = self._length_info(vid)
        return {
            "url": self.base_url + self._endpoint_url(vid),
            "title": title,
            "length": length_txt,
            "length_human": length_caption,
            "views": self._view_count_text(vid, views_default),
            "published_time": published_time,
            "videoId": vid['videoId'],
            "thumbnails": vid["thumbnail"]['thumbnails'],
            "description": self._description(vid, title),
            "channel_name": self._channel_name(vid)
        }

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------
    def search_youtube(self, query, render="all"):
        """Run a search and return the parsed results.

        Args:
            query: Search text, sent as ``search_query``.
            render: Which result groups to build: ``"all"``,
                ``"featured"``, ``"videos"`` or ``"related"``. Any other
                value yields an empty dict.

        Returns:
            ``self.data``, freshly rebuilt for this call.
        """
        self._reset_state()

        params = {"search_query": query,
                  "gl": self.location_code}

        # TODO dont cache if no results found
        html = session.get(self.base_url + "/results",
                           cookies={'CONSENT': 'YES+42'},
                           headers=self.headers, params=params).text
        soup = bs4.BeautifulSoup(html, 'html.parser')
        results = self.santize_soup_result(soup)

        contents = results['contents']['twoColumnSearchResultsRenderer']

        # NOTE(review): the previous code tested `"shelfRenderer" in <list>`
        # here, which can never be true for a list of dicts, so the section's
        # items were always used as-is. Shelf wrappers are unpacked later by
        # prepare_videos_info.
        self.primary_contents = contents["primaryContents"][
            "sectionListRenderer"]["contents"][0]['itemSectionRenderer'][
            'contents']
        self.contents = contents

        if render == "all":
            self.prepare_feature_channel_info()
            self.prepare_videos_info()
            self.prepare_playlistRender_info()
            self.prepare_horizontalCardList_info()
            self.prepare_radioRenderer_info()
            self.prepare_movieRenderer_info()
            self.prepare_carouselAdRenderer_info()
            self.prepare_autoCorrectedQuery_info()
            self.prepare_searchPyRenderer_info()
            self.filter_for_secondaryContents()

            self.data["videos"] = self.videos
            self.data["playlists"] = self.playlists
            self.data["featured_channel"] = self.featured_channel
            self.data["related_videos"] = self.related_to_search
            self.data["related_queries"] = self.related_queries
            self.data["full_movies"] = self.movies
            self.data["promoted"] = self.promoted
        elif render == "featured":
            self.prepare_feature_channel_info()
            self.prepare_videos_info()
            self.filter_for_secondaryContents()
            self.data["featured_channel"] = self.featured_channel
        elif render == "videos":
            self.prepare_videos_info()
            self.data["videos"] = self.videos
        elif render == "related":
            self.prepare_videos_info()
            self.prepare_horizontalCardList_info()
            self.data["related_videos"] = self.related_to_search
            self.data["related_queries"] = self.related_queries

        return self.data

    def page_search(self, page_type="trending"):
        """Fetch a browse page and return its videos under ``"page_videos"``.

        Args:
            page_type: ``"news"``, ``"music"`` or ``"entertainment"``; any
                other value selects ``feed/trending``.

        Returns:
            ``self.data`` with ``"page_videos"`` set. Other keys already in
            ``self.data`` are left untouched.
        """
        params = {"gl": self.location_code}

        # TODO dont cache if no results found
        if page_type in ("news", "music", "entertainment"):
            page = page_type
        else:
            page = "feed/trending"

        html = session.get(self.base_url + "/" + page,
                           cookies={'CONSENT': 'YES+42'},
                           headers=self.headers, params=params).text
        soup = bs4.BeautifulSoup(html, 'html.parser')
        results = self.santize_soup_result(soup)

        contents = results['contents']['twoColumnBrowseResultsRenderer']
        self.primary_contents_page = contents['tabs'][0]['tabRenderer'][
            'content']['sectionListRenderer']['contents']

        # Start from an empty list so repeated calls do not pile up entries
        # from earlier pages.
        self.videos_on_page = []
        if page == "feed/trending":
            self.prepare_pageTrending_info()
        else:
            self.prepare_pageRequested_info()

        self.data["page_videos"] = self.videos_on_page

        return self.data

    def watchlist_search(self, video_id=None):
        """Return the related videos listed next to a watch page.

        Args:
            video_id: Video id appended to the watch URL. It is concatenated
                to a string, so it must be a ``str`` despite the ``None``
                default.

        Returns:
            ``self.data`` with ``"watchlist_videos"`` set.
        """
        related_vids_on_page = []
        params = {"gl": self.location_code}
        base_url = "https://www.youtube.com/watch?v="
        html = session.get(base_url + video_id,
                           cookies={'CONSENT': 'YES+42'},
                           headers=self.headers, params=params).text
        soup = bs4.BeautifulSoup(html, 'html.parser')
        results = self.santize_soup_result(soup)
        contents = results['contents']['twoColumnWatchNextResults'][
            'secondaryResults']['secondaryResults']['results']

        for item in contents:
            if "compactVideoRenderer" not in item:
                continue
            vid = item["compactVideoRenderer"]
            title = vid["title"]["simpleText"]
            length_txt, length_caption = self._length_info(vid)
            related_vids_on_page.append(
                {
                    "url": base_url + vid['videoId'],
                    "title": title,
                    "length": length_txt,
                    "length_human": length_caption,
                    "views": self._view_count_text(vid, "unavailable"),
                    "published_time": self._published_time(vid),
                    "videoId": vid['videoId'],
                    "thumbnails": vid["thumbnail"]['thumbnails'],
                    "description": self._description(vid, title),
                    "channel_name": self._channel_name(vid, "longBylineText")
                }
            )

        self.data["watchlist_videos"] = related_vids_on_page
        return self.data

    def santize_soup_result(self, soup_blob):
        """Extract and decode the ``ytInitialData`` JSON object from a page.

        Args:
            soup_blob: Parsed page (``BeautifulSoup``).

        Returns:
            The decoded JSON object that starts at ``{"responseContext"``.

        Raises:
            json.JSONDecodeError: If the object is not found or is invalid.
        """
        # Make sure we always get the correct blob and sanitize it
        blob = soup_blob.find('script', string=re.compile("ytInitialData"))
        text = str(blob)
        start = text.find('{"responseContext"')
        if start == -1:
            raise json.JSONDecodeError(
                "ytInitialData JSON object not found", text, 0)
        # raw_decode stops at the end of the object, so trailing JavaScript
        # and "};" sequences inside JSON strings cannot truncate the payload.
        results, _end = json.JSONDecoder().raw_decode(text, start)
        return results

    # ------------------------------------------------------------------
    # Search-result renderers
    # ------------------------------------------------------------------
    def prepare_feature_channel_info(self):
        """Fill title, description and URL of the featured channel, if any."""
        # because order is not assured we need to make 2 passes over the data
        for item in self.primary_contents:
            if 'channelRenderer' not in item:
                continue
            channel = item['channelRenderer']
            user = self._endpoint_url(channel)
            title = channel["title"]["simpleText"]

            self.featured_channel["title"] = title
            # The title doubles as description when no snippet is present.
            self.featured_channel["description"] = self._description(
                channel, title)
            self.featured_channel["user_url"] = self.base_url + user

    def prepare_videos_info(self):
        """Collect plain videos and the videos grouped inside shelves."""
        for item in self.primary_contents:
            if 'videoRenderer' in item:
                self.videos.append(
                    self._video_entry(item['videoRenderer'],
                                      views_default=" "))
            elif 'shelfRenderer' in item:
                self._collect_shelf_videos(item['shelfRenderer'])

    def _collect_shelf_videos(self, shelf):
        """Route a shelf's videos to the featured channel or related results."""
        # most recent from channel {title_from_step_above}
        # related to your search
        heading = shelf["title"]
        if "simpleText" in heading:
            category = heading["simpleText"]
        else:
            category = heading["runs"][0]["text"]

        # TODO category localization
        # this comes in lang from your ip address
        # not good to use as dict keys, can assumptions be made about
        # ordering and num of results? last item always seems to be
        # related artists and first (if any) featured channel
        channel_title = self.featured_channel.get("title", "")
        from_featured = bool(channel_title) and category.endswith(
            channel_title)

        # Shelves without a vertical list are skipped.
        shelf_items = shelf["content"].get(
            "verticalListRenderer", {}).get("items", [])
        for shelf_item in shelf_items:
            if 'videoRenderer' not in shelf_item:
                continue
            vid = shelf_item['videoRenderer']
            length_txt, length_caption = self._length_info(vid)
            entry = {
                "url": self.base_url + self._endpoint_url(vid),
                "title": self._join_runs(vid['title']),
                "length": length_txt,
                "length_human": length_caption,
                "views": self._view_count_text(vid, " "),
                "published_time": self._published_time(vid),
                "videoId": vid['videoId'],
                "thumbnails": vid["thumbnail"]['thumbnails'],
            }
            if from_featured:
                entry["channel_name"] = self._channel_name(vid)
                self.featured_channel["videos"].append(entry)
            else:
                entry["reason"] = category
                entry["channel_name"] = self._channel_name(vid)
                self.related_to_search.append(entry)

    def prepare_playlistRender_info(self):
        """Collect playlists found in the primary results."""
        for item in self.primary_contents:
            if 'playlistRenderer' not in item:
                continue
            renderer = item['playlistRenderer']
            watch = renderer['navigationEndpoint']['watchEndpoint']
            self.playlists.append({
                "title": renderer["title"]["simpleText"],
                "url": self.base_url + self._endpoint_url(renderer),
                "videoId": watch['videoId'],
                "playlistId": watch['playlistId']
            })

    def prepare_horizontalCardList_info(self):
        """Collect the related-query cards shown in the primary results."""
        for item in self.primary_contents:
            if 'horizontalCardListRenderer' not in item:
                continue
            for card in item['horizontalCardListRenderer']['cards']:
                if 'searchRefinementCardRenderer' not in card:
                    continue
                refinement = card['searchRefinementCardRenderer']
                endpoint = refinement['searchEndpoint']
                url = endpoint['commandMetadata'][
                    "webCommandMetadata"]["url"]
                self.related_queries.append({
                    "title": endpoint['searchEndpoint']["query"],
                    "url": self.base_url + url,
                    "thumbnails": refinement["thumbnail"]['thumbnails']
                })

    def prepare_radioRenderer_info(self):
        """Collect radio (mix) entries into ``self.radio``."""
        for item in self.primary_contents:
            if 'radioRenderer' not in item:
                continue
            renderer = item['radioRenderer']
            watch = renderer['navigationEndpoint']['watchEndpoint']
            self.radio.append({
                "title": renderer["title"]["simpleText"],
                "thumbnails": renderer["thumbnail"]['thumbnails'],
                "url": self.base_url + self._endpoint_url(renderer),
                "videoId": watch['videoId'],
                "playlistId": watch['playlistId']
            })

    def prepare_movieRenderer_info(self):
        """Collect full-movie entries into ``self.movies``."""
        for item in self.primary_contents:
            if 'movieRenderer' not in item:
                continue
            movie = item['movieRenderer']
            title = self._join_runs(movie['title'])
            meta = [m["simpleText"]
                    for m in movie.get('bottomMetadataItems', [])
                    if "simpleText" in m]

            self.movies.append({
                "title": title,
                "thumbnails": movie["thumbnail"]['thumbnails'],
                "url": self.base_url + self._endpoint_url(movie),
                "videoId": movie['videoId'],
                "metadata": meta,
                "description": self._description(movie, title)
            })

    def prepare_carouselAdRenderer_info(self):
        """Do nothing: carousel ads are deliberately skipped."""
        return None

    def prepare_autoCorrectedQuery_info(self):
        """Store the query YouTube auto-corrected to in ``corrected_query``."""
        for item in self.primary_contents:
            if 'showingResultsForRenderer' in item:
                corrected = item['showingResultsForRenderer'][
                    'correctedQuery']
                self.corrected_query = self._join_runs(corrected)

    def prepare_searchPyRenderer_info(self):
        """Collect promoted videos from ``searchPyvRenderer`` entries."""
        for item in self.primary_contents:
            if 'searchPyvRenderer' not in item:
                continue
            for ad in item['searchPyvRenderer']['ads']:
                promoted = ad['promotedVideoRenderer']
                byline = promoted['longBylineText']['runs'][0]
                url = self.base_url + byline['navigationEndpoint'][
                    'browseEndpoint']['canonicalBaseUrl']
                self.promoted.append({
                    "title": byline["text"],
                    "description": promoted["description"]['simpleText'],
                    "url": url
                })

    def filter_for_secondaryContents(self):
        """Parse the side panel's watch card when the page provides one."""
        secondary = self.contents.get("secondaryContents")
        if not secondary:
            return
        panels = secondary.get(
            "secondarySearchContainerRenderer", {}).get("contents") or []
        # Only the first panel is inspected; other panel types are ignored.
        if not panels or "universalWatchCardRenderer" not in panels[0]:
            return
        self.secondary_contents = panels[0]["universalWatchCardRenderer"]
        self.prepare_secondaryContentsRender()

    def prepare_secondaryContentsRender(self):
        """Add the watch card's videos and playlists to the featured channel."""
        for section in self.secondary_contents.get("sections", []):
            if 'watchCardSectionSequenceRenderer' not in section:
                continue
            sequence = section['watchCardSectionSequenceRenderer']
            for entry in sequence['lists']:
                if 'verticalWatchCardListRenderer' in entry:
                    cards = entry['verticalWatchCardListRenderer']["items"]
                    for card in cards:
                        if 'watchCardCompactVideoRenderer' not in card:
                            continue
                        vid = card['watchCardCompactVideoRenderer']
                        watch = vid['navigationEndpoint']['watchEndpoint']
                        length_txt, length_caption = self._length_info(vid)

                        # TODO investigate
                        # These seem to always be from featured channel
                        # playlistId doesnt match any extracted playlist
                        self.featured_channel["videos"].append({
                            "url": self.base_url + self._endpoint_url(vid),
                            "title": self._join_runs(vid['title']),
                            "length": length_txt,
                            "length_human": length_caption,
                            "videoId": watch['videoId'],
                            "playlistId": watch.get('playlistId'),
                            "thumbnails": vid['thumbnail']['thumbnails']
                        })
                elif 'horizontalCardListRenderer' in entry:
                    for card in entry['horizontalCardListRenderer']['cards']:
                        if 'searchRefinementCardRenderer' not in card:
                            continue
                        refinement = card['searchRefinementCardRenderer']
                        endpoint = refinement['searchEndpoint']
                        url = endpoint['commandMetadata'][
                            'webCommandMetadata']['url']
                        self.featured_channel["playlists"].append({
                            "url": self.base_url + url,
                            "title": self._join_runs(refinement['query']),
                            "thumbnails": refinement['thumbnail'][
                                'thumbnails'],
                            "playlistId": endpoint['watchPlaylistEndpoint'][
                                'playlistId']
                        })

    # ------------------------------------------------------------------
    # Browse-page renderers
    # ------------------------------------------------------------------
    def prepare_pageTrending_info(self):
        """Collect videos from the trending page's expanded shelves."""
        for section in self.primary_contents_page:
            if 'itemSectionRenderer' not in section:
                continue
            shelf_content = section['itemSectionRenderer']['contents'][0][
                'shelfRenderer']['content']
            if 'expandedShelfContentsRenderer' not in shelf_content:
                continue

            page_items = shelf_content['expandedShelfContentsRenderer'][
                'items']
            for item in page_items:
                if 'videoRenderer' not in item:
                    continue
                self.videos_on_page.append(
                    self._video_entry(item['videoRenderer'],
                                      views_default="Live",
                                      published_fallback="Now Streaming"))

    def prepare_pageRequested_info(self):
        """Collect grid videos from news/music/entertainment pages."""
        for section in self.primary_contents_page:
            if 'itemSectionRenderer' not in section:
                continue
            page_items = section['itemSectionRenderer']['contents'][0][
                'shelfRenderer']['content']['horizontalListRenderer'][
                'items']
            for item in page_items:
                if 'gridVideoRenderer' not in item:
                    continue
                vid = item['gridVideoRenderer']
                title = vid['title']['simpleText']

                # Length lives in the first thumbnail overlay; when that
                # overlay is not a time-status one the video is treated as
                # live.
                overlays = vid.get('thumbnailOverlays') or [{}]
                overlay = overlays[0]
                if "thumbnailOverlayTimeStatusRenderer" in overlay:
                    status = overlay['thumbnailOverlayTimeStatusRenderer'][
                        'text']
                    length_caption = status['accessibility'][
                        "accessibilityData"]["label"]
                    length_txt = status['simpleText']
                else:
                    length_caption = "Live"
                    length_txt = "Live"

                self.videos_on_page.append(
                    {
                        "url": self.base_url + self._endpoint_url(vid),
                        "title": title,
                        "length": length_txt,
                        "length_human": length_caption,
                        "views": self._view_count_text(vid, "unavailable"),
                        "published_time": self._published_time(vid),
                        "videoId": vid['videoId'],
                        "thumbnails": vid["thumbnail"]['thumbnails'],
                        "description": self._description(vid, title)
                    }
                )

    def extract_video_meta(self, url):
        """Fetch a watch page and return basic metadata about the video.

        Args:
            url: Full URL of the watch page.

        Returns:
            Dict with ``title``, ``views``, ``published_time`` and
            ``channel_name``. ``views`` is "Live" when no view count is
            found. ``published_time`` is a ``datetime`` when the page's date
            text parses as ``%d %b %Y``, otherwise the string "Live".
        """
        params = {"gl": "US"}
        html = session.get(url, cookies={'CONSENT': 'YES+42'},
                           headers=self.headers, params=params).text
        soup = bs4.BeautifulSoup(html, 'html.parser')
        results = self.santize_soup_result(soup)
        info_sections = results['contents']['twoColumnWatchNextResults'][
            'results']['results']['contents']
        contents = info_sections[0]['videoPrimaryInfoRenderer']
        secondaryContents = info_sections[1]['videoSecondaryInfoRenderer']

        title = contents['title']['runs'][0]['text']
        try:
            viewCount = contents['viewCount']['videoViewCountRenderer'][
                'viewCount']['simpleText']
        except (KeyError, IndexError, TypeError):
            viewCount = "Live"
        author = secondaryContents['owner']['videoOwnerRenderer']['title'][
            'runs'][0]['text']
        try:
            # A fixed midnight time is appended so the date can be parsed
            # with a single date-and-time format.
            actualDate = contents['dateText']['simpleText'] + " 12:00AM"
            publishedDate = datetime.datetime.strptime(
                actualDate, '%d %b %Y %I:%M%p')
        except (KeyError, TypeError, ValueError):
            publishedDate = "Live"

        vidmetadata = {
            "title": title,
            "views": viewCount,
            "published_time": publishedDate,
            "channel_name": author
        }
        return vidmetadata