try to avoid rate limits: fetch the Flickr image only once, directly in the WP upload

my-fix-branch
Shane 7 months ago
parent 30e871e822
commit 022b52a8a7
1 changed file:
    foodie_utils.py (206 changed lines)

@@ -227,14 +227,14 @@ def get_image(search_query):
     current_time = time.time()
     time_since_last_request = current_time - last_flickr_request_time
-    if time_since_last_request < 5:
-        time.sleep(5 - time_since_last_request)
+    if time_since_last_request < 10:
+        time.sleep(10 - time_since_last_request)
     last_flickr_request_time = time.time()
     headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-    def search_flickr(query, per_page=20):
+    def search_flickr(query, per_page=5):
         try:
             photos = flickr_api.Photo.search(
                 text=query,
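
The hunk above doubles the client-side throttle between Flickr API calls from 5 to 10 seconds and shrinks each search to 5 results. A minimal standalone sketch of that throttle pattern, assuming a module-level timestamp (the names here are illustrative, not the repo's):

import time

_last_request_time = 0.0
MIN_INTERVAL = 10  # seconds between requests, matching the new value in the diff

def throttled_call(fn, *args, **kwargs):
    # Sleep just long enough that consecutive calls are at least MIN_INTERVAL apart.
    global _last_request_time
    elapsed = time.time() - _last_request_time
    if elapsed < MIN_INTERVAL:
        time.sleep(MIN_INTERVAL - elapsed)
    _last_request_time = time.time()
    return fn(*args, **kwargs)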
@@ -270,71 +270,28 @@ def get_image(search_query):
         if not img_url or img_url in used_images:
             return None
-        temp_file = None
-        try:
-            for attempt in range(3):
-                img_response = requests.get(img_url, headers=headers, timeout=10)
-                if img_response.status_code == 429:
-                    wait_time = 5 * (2 ** attempt)
-                    logging.warning(f"Rate limit hit for {img_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
-                    time.sleep(wait_time)
-                    continue
-                img_response.raise_for_status()
-                break
-            else:
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
-                return None
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
-                temp_file.write(img_response.content)
-                temp_path = temp_file.name
-            img = Image.open(temp_path)
-            text = pytesseract.image_to_string(img)
-            char_count = len(text.strip())
-            logging.info(f"OCR processed {img_url}: {char_count} characters detected")
-            if char_count > 200:
-                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
-                return None
-            uploader = photo.owner.username
-            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
-            used_images.add(img_url)
-            save_used_images()
-            flickr_data = {
-                "title": search_query,
-                "image_url": img_url,
-                "source": "Flickr",
-                "uploader": uploader,
-                "page_url": page_url,
-                "timestamp": datetime.now(timezone.utc).isoformat(),
-                "ocr_chars": char_count
-            }
-            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with open(flickr_file, 'a') as f:
-                json.dump(flickr_data, f)
-                f.write('\n')
-            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
-            logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
-            return img_url, "Flickr", uploader, page_url
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 429:
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
-                return None
-            else:
-                logging.warning(f"Download failed for {img_url}: {e}")
-                return None
-        except Exception as e:
-            logging.warning(f"OCR processing failed for {img_url}: {e}")
-            return None
-        finally:
-            if temp_file and os.path.exists(temp_path):
-                os.unlink(temp_path)
+        uploader = photo.owner.username
+        page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+        used_images.add(img_url)
+        save_used_images()
+        flickr_data = {
+            "title": search_query,
+            "image_url": img_url,
+            "source": "Flickr",
+            "uploader": uploader,
+            "page_url": page_url,
+            "timestamp": datetime.now(timezone.utc).isoformat()
+        }
+        flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+        with open(flickr_file, 'a') as f:
+            json.dump(flickr_data, f)
+            f.write('\n')
+        logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
+        logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+        return img_url, "Flickr", uploader, page_url

     def search_ddg_for_flickr(query):
         ddg_query = f"{query} site:flickr.com"
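
With this hunk the per-photo helper in get_image no longer downloads the file or runs the pytesseract text check; it only records the chosen URL and attribution, deferring the single download to upload_image_to_wp. A hedged sketch of that metadata-only step in isolation (the function name and default path are illustrative, not the repo's):

import json
from datetime import datetime, timezone

def record_flickr_choice(search_query, img_url, uploader, page_url,
                         log_path="flickr_images.json"):
    # Append one JSON-lines record for the selected image; nothing is downloaded here.
    record = {
        "title": search_query,
        "image_url": img_url,
        "source": "Flickr",
        "uploader": uploader,
        "page_url": page_url,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    with open(log_path, "a") as f:
        json.dump(record, f)
        f.write("\n")
    return img_url, "Flickr", uploader, page_url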
@@ -352,7 +309,7 @@ def get_image(search_query):
                 photo_id = match.group(1)
                 photo_ids.add(photo_id)
-            photo_ids = list(photo_ids)[:5]  # Limit to 5 IDs
+            photo_ids = list(photo_ids)[:2]  # Limit to 2 IDs
             logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
             return photo_ids
         except Exception as e:
@@ -571,8 +528,28 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     }
     logging.info(f"Fetching image from {image_url} for '{post_title}'")
-    image_response = requests.get(image_url, headers=image_headers, timeout=10)
-    image_response.raise_for_status()
+    # Add rate limit handling for image download
+    for attempt in range(3):
+        try:
+            image_response = requests.get(image_url, headers=image_headers, timeout=10)
+            if image_response.status_code == 429:
+                wait_time = 10 * (2 ** attempt)  # 10s, 20s, 40s
+                logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
+                time.sleep(wait_time)
+                continue
+            image_response.raise_for_status()
+            break
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 429:
+                wait_time = 10 * (2 ** attempt)
+                logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
+                time.sleep(wait_time)
+                continue
+            raise
+    else:
+        logging.warning(f"Rate limit hit for {image_url} after retries. Failing image upload.")
+        return None

     response = requests.post(
         f"{wp_base_url}/media",
@@ -1044,18 +1021,18 @@ def get_flickr_image(search_query, relevance_keywords):
     flickr_request_count += 1
     logging.info(f"Flickr request count: {flickr_request_count}/3600")
-    # Enforce a minimum delay of 5 seconds between Flickr requests
+    # Enforce a minimum delay of 10 seconds between Flickr requests
     current_time = time.time()
     time_since_last_request = current_time - last_flickr_request_time
-    if time_since_last_request < 5:
-        time.sleep(5 - time_since_last_request)
+    if time_since_last_request < 10:
+        time.sleep(10 - time_since_last_request)
     last_flickr_request_time = time.time()
     headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
     # Helper function to search Flickr with a given query
-    def search_flickr(query, per_page=20):
+    def search_flickr(query, per_page=5):  # Reduced per_page to limit results
         try:
             photos = flickr_api.Photo.search(
                 text=query,
@@ -1079,7 +1056,7 @@ def get_flickr_image(search_query, relevance_keywords):
             logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
             return None
-    # Helper function to process a photo
+    # Helper function to process a photo (fetch URL and metadata only)
     def process_photo(photo):
         tags = [tag.text.lower() for tag in photo.getTags()]
         title = photo.title.lower() if photo.title else ""
@@ -1095,71 +1072,28 @@ def get_flickr_image(search_query, relevance_keywords):
         if not img_url or img_url in used_images:
             return None
-        temp_file = None
-        try:
-            for attempt in range(3):
-                img_response = requests.get(img_url, headers=headers, timeout=10)
-                if img_response.status_code == 429:
-                    wait_time = 5 * (2 ** attempt)
-                    logging.warning(f"Rate limit hit for {img_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
-                    time.sleep(wait_time)
-                    continue
-                img_response.raise_for_status()
-                break
-            else:
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
-                return None
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
-                temp_file.write(img_response.content)
-                temp_path = temp_file.name
-            img = Image.open(temp_path)
-            text = pytesseract.image_to_string(img)
-            char_count = len(text.strip())
-            logging.info(f"OCR processed {img_url}: {char_count} characters detected")
-            if char_count > 200:
-                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
-                return None
-            uploader = photo.owner.username
-            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
-            used_images.add(img_url)
-            save_used_images()
-            flickr_data = {
-                "title": search_query,
-                "image_url": img_url,
-                "source": "Flickr",
-                "uploader": uploader,
-                "page_url": page_url,
-                "timestamp": datetime.now(timezone.utc).isoformat(),
-                "ocr_chars": char_count
-            }
-            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with open(flickr_file, 'a') as f:
-                json.dump(flickr_data, f)
-                f.write('\n')
-            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
-            logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
-            return img_url, "Flickr", uploader, page_url
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 429:
-                logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.")
-                return None
-            else:
-                logging.warning(f"Download failed for {img_url}: {e}")
-                return None
-        except Exception as e:
-            logging.warning(f"OCR processing failed for {img_url}: {e}")
-            return None
-        finally:
-            if temp_file and os.path.exists(temp_path):
-                os.unlink(temp_path)
+        uploader = photo.owner.username
+        page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+        used_images.add(img_url)
+        save_used_images()
+        flickr_data = {
+            "title": search_query,
+            "image_url": img_url,
+            "source": "Flickr",
+            "uploader": uploader,
+            "page_url": page_url,
+            "timestamp": datetime.now(timezone.utc).isoformat()
+        }
+        flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+        with open(flickr_file, 'a') as f:
+            json.dump(flickr_data, f)
+            f.write('\n')
+        logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
+        logging.info(f"Selected Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+        return img_url, "Flickr", uploader, page_url

     # Helper function to search DDG and extract Flickr photo IDs
     def search_ddg_for_flickr(query):
@@ -1178,7 +1112,7 @@ def get_flickr_image(search_query, relevance_keywords):
                 photo_id = match.group(1)
                 photo_ids.add(photo_id)
-            photo_ids = list(photo_ids)[:5]  # Limit to 5 IDs
+            photo_ids = list(photo_ids)[:2]  # Limit to 2 IDs
             logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
             return photo_ids
         except Exception as e:
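
Once the bytes are fetched with the backoff added above, upload_image_to_wp POSTs them to {wp_base_url}/media. A minimal sketch of that WordPress REST API call, assuming wp_base_url already points at /wp-json/wp/v2 and that basic auth with an application password is in use (both are assumptions, not confirmed by the diff):

import requests

def upload_bytes_to_wp_media(image_bytes, filename, wp_base_url, wp_username, wp_password):
    # Send already-downloaded image bytes to the WordPress media endpoint.
    headers = {
        "Content-Disposition": f'attachment; filename="{filename}"',
        "Content-Type": "image/jpeg",
    }
    response = requests.post(
        f"{wp_base_url}/media",           # e.g. https://example.com/wp-json/wp/v2/media
        headers=headers,
        data=image_bytes,
        auth=(wp_username, wp_password),  # application password / basic auth (assumed)
        timeout=30,
    )
    response.raise_for_status()
    return response.json()["id"]          # attachment ID, usable as the post's featured image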
