#!/usr/bin/env python3
# requires: selenium, webdriver-manager, chromium-driver, retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import selenium.common.exceptions as sel_ex
import sys
import time
import urllib.parse
from retry import retry
import argparse
import logging

logging.basicConfig(stream=sys.stderr, level=logging.INFO)
logger = logging.getLogger()
retry_logger = None

# CSS selectors / class names used by Google Images (these change periodically).
css_thumbnail = "img.YQ4gaf"
css_large = "img.sFlh5c.FyHeAf.iPVvYb"
tn_clickable_div_class = "IQHeM"
# css_load_more = ".mye4qd"
selenium_exceptions = (sel_ex.ElementClickInterceptedException,
                       sel_ex.ElementNotInteractableException,
                       sel_ex.StaleElementReferenceException)


def scroll_to_end(wd):
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")


@retry(exceptions=KeyError, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def get_thumbnails(wd, want_more_than=0):
    """Return the thumbnail elements on the page, retrying until more than
    `want_more_than` of them have loaded."""
    # wd.execute_script("document.querySelector('{}').click();".format(css_load_more))
    thumbnails = wd.find_elements(By.CSS_SELECTOR, css_thumbnail)
    n_results = len(thumbnails)
    if n_results <= want_more_than:
        raise KeyError("no new thumbnails")
    return thumbnails


@retry(exceptions=KeyError, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def get_image_src(wd):
    """Return the src URLs of the currently opened full-size image(s),
    skipping data: URLs and Google's thumbnail cache."""
    actual_images = wd.find_elements(By.CSS_SELECTOR, css_large)
    sources = []
    for img in actual_images:
        src = img.get_attribute("src")
        if src and src.startswith("http") and not src.startswith("https://encrypted-tbn0.gstatic.com/"):
            sources.append(src)
    if not sources:
        raise KeyError("no large image")
    return sources


@retry(exceptions=selenium_exceptions, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def retry_click(el):
    el.click()


def get_images(wd, start=0, n=20, out=None):
    """Scroll until at least n thumbnails are loaded, click each one, and
    collect the full-size image URLs, printing them to `out` as they are found."""
    thumbnails = []
    count = len(thumbnails)
    while count < n:
        scroll_to_end(wd)
        try:
            thumbnails = get_thumbnails(wd, want_more_than=count)
        except KeyError as e:
            logger.warning(f"cannot load enough thumbnails: {e}")
            break
        count = len(thumbnails)
    sources = []
    for tn in thumbnails:
        try:
            wd.execute_script("arguments[0].scrollIntoView();", tn)
            time.sleep(3)
            ActionChains(wd).move_to_element_with_offset(tn, 0, 0).perform()
            time.sleep(0.1)
            # The <img> itself is often not the click target; find the enclosing
            # clickable div (class tn_clickable_div_class) that covers the thumbnail.
            x = tn.location["x"]
            y = tn.location["y"]
            divs = wd.find_elements(By.CLASS_NAME, tn_clickable_div_class)
            logger.warning(f"divs: {divs}")
            div = None
            for d in divs:
                if d.location["x"] <= x and d.location["y"] <= y \
                        and d.location["x"] + d.size["width"] >= x + tn.size["width"] \
                        and d.location["y"] + d.size["height"] >= y + tn.size["height"]:
                    div = d
                    break
            div = div or tn  # fall back to clicking the thumbnail itself
            logger.warning(f"div: {div.get_attribute('outerHTML')}")
            logger.warning(f"div.class: {div.get_attribute('class')}")
            retry_click(div)
        except selenium_exceptions as e:
            logger.warning(f"main image click failed: {e}")
            continue
        sources1 = []
        try:
            sources1 = get_image_src(wd)
        except KeyError as e:
            logger.warning(f"main image not found: {e}")
        if not sources1:
            tn_src = tn.get_attribute("src")
            if tn_src and not tn_src.startswith("data"):
                logger.debug("no src found for main image, using thumbnail")
                sources1 = [tn_src]
            else:
                logger.debug("no src found for main image, thumbnail is a data URL")
        for src in sources1:
            if src not in sources:
                sources.append(src)
                if out:
                    print(src, file=out)
                    out.flush()
        if len(sources) >= n:
            break
    return sources


def google_image_search(query, safe="off", n=20, opts='', out=None, wd=None):
    """Run a Google Images search (or open a search URL directly) and return
    up to n full-size image URLs."""
    if wd is None:
        with start_chrome() as wd:
            return google_image_search(query, safe=safe, n=n, opts=opts, out=out, wd=wd)
    if query.startswith("https://"):
        search_url = query
    else:
        search_url_t = "https://www.google.com/search?safe={safe}&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img&tbs={opts}"
        search_url = search_url_t.format(q=urllib.parse.quote(query), opts=urllib.parse.quote(opts or ''), safe=safe)
    wd.get(search_url)
    sources = get_images(wd, n=n, out=out)
    return sources


def install_webdriver():
    # Downloads a matching chromedriver; the Service object is not reused because
    # webdriver.Chrome() locates the installed driver on its own.
    Service(ChromeDriverManager().install())


def start_chrome(install=True):
    if install:
        install_webdriver()
    opts = Options()
    opts.add_argument("--headless")
    return webdriver.Chrome(options=opts)


def setup_logging(quiet=False, debug=False):
    # force=True (Python 3.8+) replaces the handler installed at import time,
    # so the --debug/--quiet flags actually take effect.
    if debug:
        logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)s %(message)s", force=True)
    elif quiet:
        logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
    else:
        logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)


def main():
    parser = argparse.ArgumentParser(description='Fetch image URLs from Google Image Search.')
    parser.add_argument('--safe', type=str, default="off", help='safe search [off|active|images]')
    parser.add_argument('--opts', type=str, default="", help='search options, e.g. isz:lt,islt:svga,itp:photo,ic:color,ift:jpg')
    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
    parser.add_argument("--quiet", action="store_true", help="Suppress warnings")
    parser.add_argument('query', type=str, help='image search query')
    parser.add_argument('n', type=int, default=20, help='number of images (approx)')
    args = parser.parse_args()
    setup_logging(quiet=args.quiet, debug=args.debug)
    # opts.add_argument("--blink-settings=imagesEnabled=false")
    # with webdriver.Chrome(options=opts) as wd:
    # with webdriver.Chrome(service=s, options=opts) as wd:
    google_image_search(args.query, safe=args.safe, n=args.n, opts=args.opts, out=sys.stdout)


if __name__ == "__main__":
    main()