Try to avoid Flickr rate limits: skip the image download/OCR step and upload to WP directly

2025-05-01 18:24:33 +10:00
parent 30e871e822
commit 022b52a8a7
1 changed file with 76 additions and 142 deletions
@@ -227,14 +227,14 @@ def get_image(search_query):
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
-    if time_since_last_request < 5:
+    if time_since_last_request < 10:
-        time.sleep(5 - time_since_last_request)
+        time.sleep(10 - time_since_last_request)
    last_flickr_request_time = time.time()
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-    def search_flickr(query, per_page=20):
+    def search_flickr(query, per_page=5):
        try:
            photos = flickr_api.Photo.search(
                text=query,
@@ -270,71 +270,28 @@ def get_image(search_query):
        if not img_url or img_url in used_images:
            return None
-        temp_file = None
+        uploader = photo.owner.username
-        try:
+        page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
-            for attempt in range(3):
+        
-                img_response = requests.get(img_url, headers=headers, timeout=10)
+        used_images.add(img_url)
-                if img_response.status_code == 429:
+        save_used_images()
-                    wait_time = 5 * (2 ** attempt)
+        
-                    logging.warning(f"Rate limit hit for {img_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
+        flickr_data = {
-                    time.sleep(wait_time)
+            "title": search_query,
-                    continue
+            "image_url": img_url,
-                img_response.raise_for_status()
+            "source": "Flickr",
-                break
+            "uploader": uploader,
-            else:
+            "page_url": page_url,
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
+            "timestamp": datetime.now(timezone.utc).isoformat()
-                return None
+        }
-
+        flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+        with open(flickr_file, 'a') as f:
-                temp_file.write(img_response.content)
+            json.dump(flickr_data, f)
-                temp_path = temp_file.name
+            f.write('\n')
-
+        logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
-            img = Image.open(temp_path)
+        
-            text = pytesseract.image_to_string(img)
+        logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
-            char_count = len(text.strip())
+        return img_url, "Flickr", uploader, page_url
            logging.info(f"OCR processed {img_url}: {char_count} characters detected")
            if char_count > 200:
                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
                return None
            uploader = photo.owner.username
            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
            used_images.add(img_url)
            save_used_images()
            flickr_data = {
                "title": search_query,
                "image_url": img_url,
                "source": "Flickr",
                "uploader": uploader,
                "page_url": page_url,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "ocr_chars": char_count
            }
            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
            with open(flickr_file, 'a') as f:
                json.dump(flickr_data, f)
                f.write('\n')
            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
            logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
            return img_url, "Flickr", uploader, page_url
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
                return None
            else:
                logging.warning(f"Download failed for {img_url}: {e}")
                return None
        except Exception as e:
            logging.warning(f"OCR processing failed for {img_url}: {e}")
            return None
        finally:
            if temp_file and os.path.exists(temp_path):
                os.unlink(temp_path)
    def search_ddg_for_flickr(query):
        ddg_query = f"{query} site:flickr.com"
@@ -352,7 +309,7 @@ def get_image(search_query):
                    photo_id = match.group(1)
                    photo_ids.add(photo_id)
-            photo_ids = list(photo_ids)[:5]  # Limit to 5 IDs
+            photo_ids = list(photo_ids)[:2]  # Limit to 2 IDs
            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
            return photo_ids
        except Exception as e:
@@ -571,9 +528,29 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        logging.info(f"Fetching image from {image_url} for '{post_title}'")
        image_response = requests.get(image_url, headers=image_headers, timeout=10)
        image_response.raise_for_status()
        # Add rate limit handling for image download
        for attempt in range(3):
            try:
                image_response = requests.get(image_url, headers=image_headers, timeout=10)
                if image_response.status_code == 429:
                    wait_time = 10 * (2 ** attempt)  # 10s, 20s, 40s
                    logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
                    time.sleep(wait_time)
                    continue
                image_response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    wait_time = 10 * (2 ** attempt)
                    logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
                    time.sleep(wait_time)
                    continue
                raise
        else:
            logging.warning(f"Rate limit hit for {image_url} after retries. Failing image upload.")
            return None
        response = requests.post(
            f"{wp_base_url}/media",
            headers=headers,
@@ -1044,18 +1021,18 @@ def get_flickr_image(search_query, relevance_keywords):
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")
-    # Enforce a minimum delay of 5 seconds between Flickr requests
+    # Enforce a minimum delay of 10 seconds between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
-    if time_since_last_request < 5:
+    if time_since_last_request < 10:
-        time.sleep(5 - time_since_last_request)
+        time.sleep(10 - time_since_last_request)
    last_flickr_request_time = time.time()
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
    # Helper function to search Flickr with a given query
-    def search_flickr(query, per_page=20):
+    def search_flickr(query, per_page=5):  # Reduced per_page to limit results
        try:
            photos = flickr_api.Photo.search(
                text=query,
@@ -1079,7 +1056,7 @@ def get_flickr_image(search_query, relevance_keywords):
            logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
            return None
-    # Helper function to process a photo
+    # Helper function to process a photo (fetch URL and metadata only)
    def process_photo(photo):
        tags = [tag.text.lower() for tag in photo.getTags()]
        title = photo.title.lower() if photo.title else ""
@@ -1095,71 +1072,28 @@ def get_flickr_image(search_query, relevance_keywords):
        if not img_url or img_url in used_images:
            return None
-        temp_file = None
+        uploader = photo.owner.username
-        try:
+        page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
-            for attempt in range(3):
+        
-                img_response = requests.get(img_url, headers=headers, timeout=10)
+        used_images.add(img_url)
-                if img_response.status_code == 429:
+        save_used_images()
-                    wait_time = 5 * (2 ** attempt)
+        
-                    logging.warning(f"Rate limit hit for {img_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
+        flickr_data = {
-                    time.sleep(wait_time)
+            "title": search_query,
-                    continue
+            "image_url": img_url,
-                img_response.raise_for_status()
+            "source": "Flickr",
-                break
+            "uploader": uploader,
-            else:
+            "page_url": page_url,
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
+            "timestamp": datetime.now(timezone.utc).isoformat()
-                return None
+        }
-
+        flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+        with open(flickr_file, 'a') as f:
-                temp_file.write(img_response.content)
+            json.dump(flickr_data, f)
-                temp_path = temp_file.name
+            f.write('\n')
-
+        logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
-            img = Image.open(temp_path)
+        
-            text = pytesseract.image_to_string(img)
+        logging.info(f"Selected Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
-            char_count = len(text.strip())
+        return img_url, "Flickr", uploader, page_url
            logging.info(f"OCR processed {img_url}: {char_count} characters detected")
            if char_count > 200:
                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
                return None
            uploader = photo.owner.username
            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
            used_images.add(img_url)
            save_used_images()
            flickr_data = {
                "title": search_query,
                "image_url": img_url,
                "source": "Flickr",
                "uploader": uploader,
                "page_url": page_url,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "ocr_chars": char_count
            }
            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
            with open(flickr_file, 'a') as f:
                json.dump(flickr_data, f)
                f.write('\n')
            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
            logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
            return img_url, "Flickr", uploader, page_url
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
                return None
            else:
                logging.warning(f"Download failed for {img_url}: {e}")
                return None
        except Exception as e:
            logging.warning(f"OCR processing failed for {img_url}: {e}")
            return None
        finally:
            if temp_file and os.path.exists(temp_path):
                os.unlink(temp_path)
    # Helper function to search DDG and extract Flickr photo IDs
    def search_ddg_for_flickr(query):
@@ -1178,7 +1112,7 @@ def get_flickr_image(search_query, relevance_keywords):
                    photo_id = match.group(1)
                    photo_ids.add(photo_id)
-            photo_ids = list(photo_ids)[:5]  # Limit to 5 IDs
+            photo_ids = list(photo_ids)[:2]  # Limit to 2 IDs
            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
            return photo_ids
        except Exception as e: