Try to avoid Flickr rate limits: skip the image download during Flickr search and fetch the image directly at WP upload time

2025-05-01 18:24:33 +10:00
parent 30e871e822
commit 022b52a8a7
1 changed files with 76 additions and 142 deletions
@@ -227,14 +227,14 @@ def get_image(search_query):
    
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
-    if time_since_last_request < 5:
-        time.sleep(5 - time_since_last_request)
+    if time_since_last_request < 10:
+        time.sleep(10 - time_since_last_request)
    
    last_flickr_request_time = time.time()
    
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
    
-    def search_flickr(query, per_page=20):
+    def search_flickr(query, per_page=5):
        try:
            photos = flickr_api.Photo.search(
                text=query,
@@ -270,71 +270,28 @@ def get_image(search_query):
        if not img_url or img_url in used_images:
            return None
        
-        temp_file = None
-        try:
-            for attempt in range(3):
-                img_response = requests.get(img_url, headers=headers, timeout=10)
-                if img_response.status_code == 429:
-                    wait_time = 5 * (2 ** attempt)
-                    logging.warning(f"Rate limit hit for {img_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
-                    time.sleep(wait_time)
-                    continue
-                img_response.raise_for_status()
-                break
-            else:
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
-                return None
-
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
-                temp_file.write(img_response.content)
-                temp_path = temp_file.name
-
-            img = Image.open(temp_path)
-            text = pytesseract.image_to_string(img)
-            char_count = len(text.strip())
-            logging.info(f"OCR processed {img_url}: {char_count} characters detected")
-
-            if char_count > 200:
-                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
-                return None
-
-            uploader = photo.owner.username
-            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
-            
-            used_images.add(img_url)
-            save_used_images()
-            
-            flickr_data = {
-                "title": search_query,
-                "image_url": img_url,
-                "source": "Flickr",
-                "uploader": uploader,
-                "page_url": page_url,
-                "timestamp": datetime.now(timezone.utc).isoformat(),
-                "ocr_chars": char_count
-            }
-            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with open(flickr_file, 'a') as f:
-                json.dump(flickr_data, f)
-                f.write('\n')
-            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
-            
-            logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
-            return img_url, "Flickr", uploader, page_url
-
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 429:
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
-                return None
-            else:
-                logging.warning(f"Download failed for {img_url}: {e}")
-                return None
-        except Exception as e:
-            logging.warning(f"OCR processing failed for {img_url}: {e}")
-            return None
-        finally:
-            if temp_file and os.path.exists(temp_path):
-                os.unlink(temp_path)
+        uploader = photo.owner.username
+        page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+        
+        used_images.add(img_url)
+        save_used_images()
+        
+        flickr_data = {
+            "title": search_query,
+            "image_url": img_url,
+            "source": "Flickr",
+            "uploader": uploader,
+            "page_url": page_url,
+            "timestamp": datetime.now(timezone.utc).isoformat()
+        }
+        flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+        with open(flickr_file, 'a') as f:
+            json.dump(flickr_data, f)
+            f.write('\n')
+        logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
+        
+        logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+        return img_url, "Flickr", uploader, page_url

    def search_ddg_for_flickr(query):
        ddg_query = f"{query} site:flickr.com"
@@ -352,7 +309,7 @@ def get_image(search_query):
                    photo_id = match.group(1)
                    photo_ids.add(photo_id)
            
-            photo_ids = list(photo_ids)[:5]  # Limit to 5 IDs
+            photo_ids = list(photo_ids)[:2]  # Limit to 2 IDs
            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
            return photo_ids
        except Exception as e:
@@ -571,9 +528,29 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        logging.info(f"Fetching image from {image_url} for '{post_title}'")
-        image_response = requests.get(image_url, headers=image_headers, timeout=10)
-        image_response.raise_for_status()
        
+        # Add rate limit handling for image download
+        for attempt in range(3):
+            try:
+                image_response = requests.get(image_url, headers=image_headers, timeout=10)
+                if image_response.status_code == 429:
+                    wait_time = 10 * (2 ** attempt)  # 10s, 20s, 40s
+                    logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
+                    time.sleep(wait_time)
+                    continue
+                image_response.raise_for_status()
+                break
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    wait_time = 10 * (2 ** attempt)
+                    logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
+                    time.sleep(wait_time)
+                    continue
+                raise
+        else:
+            logging.warning(f"Rate limit hit for {image_url} after retries. Failing image upload.")
+            return None
+
        response = requests.post(
            f"{wp_base_url}/media",
            headers=headers,
@@ -1044,18 +1021,18 @@ def get_flickr_image(search_query, relevance_keywords):
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")
    
-    # Enforce a minimum delay of 5 seconds between Flickr requests
+    # Enforce a minimum delay of 10 seconds between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
-    if time_since_last_request < 5:
-        time.sleep(5 - time_since_last_request)
+    if time_since_last_request < 10:
+        time.sleep(10 - time_since_last_request)
    
    last_flickr_request_time = time.time()
    
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
    
    # Helper function to search Flickr with a given query
-    def search_flickr(query, per_page=20):
+    def search_flickr(query, per_page=5):  # Reduced per_page to limit results
        try:
            photos = flickr_api.Photo.search(
                text=query,
@@ -1079,7 +1056,7 @@ def get_flickr_image(search_query, relevance_keywords):
            logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
            return None

-    # Helper function to process a photo
+    # Helper function to process a photo (fetch URL and metadata only)
    def process_photo(photo):
        tags = [tag.text.lower() for tag in photo.getTags()]
        title = photo.title.lower() if photo.title else ""
@@ -1095,71 +1072,28 @@ def get_flickr_image(search_query, relevance_keywords):
        if not img_url or img_url in used_images:
            return None
        
-        temp_file = None
-        try:
-            for attempt in range(3):
-                img_response = requests.get(img_url, headers=headers, timeout=10)
-                if img_response.status_code == 429:
-                    wait_time = 5 * (2 ** attempt)
-                    logging.warning(f"Rate limit hit for {img_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
-                    time.sleep(wait_time)
-                    continue
-                img_response.raise_for_status()
-                break
-            else:
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
-                return None
-
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
-                temp_file.write(img_response.content)
-                temp_path = temp_file.name
-
-            img = Image.open(temp_path)
-            text = pytesseract.image_to_string(img)
-            char_count = len(text.strip())
-            logging.info(f"OCR processed {img_url}: {char_count} characters detected")
-
-            if char_count > 200:
-                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
-                return None
-
-            uploader = photo.owner.username
-            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
-            
-            used_images.add(img_url)
-            save_used_images()
-            
-            flickr_data = {
-                "title": search_query,
-                "image_url": img_url,
-                "source": "Flickr",
-                "uploader": uploader,
-                "page_url": page_url,
-                "timestamp": datetime.now(timezone.utc).isoformat(),
-                "ocr_chars": char_count
-            }
-            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with open(flickr_file, 'a') as f:
-                json.dump(flickr_data, f)
-                f.write('\n')
-            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
-            
-            logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
-            return img_url, "Flickr", uploader, page_url
-
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 429:
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
-                return None
-            else:
-                logging.warning(f"Download failed for {img_url}: {e}")
-                return None
-        except Exception as e:
-            logging.warning(f"OCR processing failed for {img_url}: {e}")
-            return None
-        finally:
-            if temp_file and os.path.exists(temp_path):
-                os.unlink(temp_path)
+        uploader = photo.owner.username
+        page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+        
+        used_images.add(img_url)
+        save_used_images()
+        
+        flickr_data = {
+            "title": search_query,
+            "image_url": img_url,
+            "source": "Flickr",
+            "uploader": uploader,
+            "page_url": page_url,
+            "timestamp": datetime.now(timezone.utc).isoformat()
+        }
+        flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+        with open(flickr_file, 'a') as f:
+            json.dump(flickr_data, f)
+            f.write('\n')
+        logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
+        
+        logging.info(f"Selected Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+        return img_url, "Flickr", uploader, page_url

    # Helper function to search DDG and extract Flickr photo IDs
    def search_ddg_for_flickr(query):
@@ -1178,7 +1112,7 @@ def get_flickr_image(search_query, relevance_keywords):
                    photo_id = match.group(1)
                    photo_ids.add(photo_id)
            
-            photo_ids = list(photo_ids)[:5]  # Limit to 5 IDs
+            photo_ids = list(photo_ids)[:2]  # Limit to 2 IDs
            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
            return photo_ids
        except Exception as e: