|
|
|
@ -227,14 +227,14 @@ def get_image(search_query): |
|
|
|
|
|
|
|
|
|
|
|
current_time = time.time() |
|
|
|
current_time = time.time() |
|
|
|
time_since_last_request = current_time - last_flickr_request_time |
|
|
|
time_since_last_request = current_time - last_flickr_request_time |
|
|
|
if time_since_last_request < 5: |
|
|
|
if time_since_last_request < 10: |
|
|
|
time.sleep(5 - time_since_last_request) |
|
|
|
time.sleep(10 - time_since_last_request) |
|
|
|
|
|
|
|
|
|
|
|
last_flickr_request_time = time.time() |
|
|
|
last_flickr_request_time = time.time() |
|
|
|
|
|
|
|
|
|
|
|
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} |
|
|
|
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} |
|
|
|
|
|
|
|
|
|
|
|
def search_flickr(query, per_page=20): |
|
|
|
def search_flickr(query, per_page=5): |
|
|
|
try: |
|
|
|
try: |
|
|
|
photos = flickr_api.Photo.search( |
|
|
|
photos = flickr_api.Photo.search( |
|
|
|
text=query, |
|
|
|
text=query, |
|
|
|
@ -270,71 +270,28 @@ def get_image(search_query): |
|
|
|
if not img_url or img_url in used_images: |
|
|
|
if not img_url or img_url in used_images: |
|
|
|
return None |
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
temp_file = None |
|
|
|
uploader = photo.owner.username |
|
|
|
try: |
|
|
|
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}" |
|
|
|
for attempt in range(3): |
|
|
|
|
|
|
|
img_response = requests.get(img_url, headers=headers, timeout=10) |
|
|
|
|
|
|
|
if img_response.status_code == 429: |
|
|
|
|
|
|
|
wait_time = 5 * (2 ** attempt) |
|
|
|
|
|
|
|
logging.warning(f"Rate limit hit for {img_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).") |
|
|
|
|
|
|
|
time.sleep(wait_time) |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
img_response.raise_for_status() |
|
|
|
|
|
|
|
break |
|
|
|
|
|
|
|
else: |
|
|
|
|
|
|
|
logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file: |
|
|
|
|
|
|
|
temp_file.write(img_response.content) |
|
|
|
|
|
|
|
temp_path = temp_file.name |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
img = Image.open(temp_path) |
|
|
|
|
|
|
|
text = pytesseract.image_to_string(img) |
|
|
|
|
|
|
|
char_count = len(text.strip()) |
|
|
|
|
|
|
|
logging.info(f"OCR processed {img_url}: {char_count} characters detected") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if char_count > 200: |
|
|
|
|
|
|
|
logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uploader = photo.owner.username |
|
|
|
|
|
|
|
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
used_images.add(img_url) |
|
|
|
|
|
|
|
save_used_images() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
flickr_data = { |
|
|
|
used_images.add(img_url) |
|
|
|
"title": search_query, |
|
|
|
save_used_images() |
|
|
|
"image_url": img_url, |
|
|
|
|
|
|
|
"source": "Flickr", |
|
|
|
|
|
|
|
"uploader": uploader, |
|
|
|
|
|
|
|
"page_url": page_url, |
|
|
|
|
|
|
|
"timestamp": datetime.now(timezone.utc).isoformat(), |
|
|
|
|
|
|
|
"ocr_chars": char_count |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
flickr_file = "/home/shane/foodie_automator/flickr_images.json" |
|
|
|
|
|
|
|
with open(flickr_file, 'a') as f: |
|
|
|
|
|
|
|
json.dump(flickr_data, f) |
|
|
|
|
|
|
|
f.write('\n') |
|
|
|
|
|
|
|
logging.info(f"Saved Flickr image to {flickr_file}: {img_url}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})") |
|
|
|
flickr_data = { |
|
|
|
return img_url, "Flickr", uploader, page_url |
|
|
|
"title": search_query, |
|
|
|
|
|
|
|
"image_url": img_url, |
|
|
|
|
|
|
|
"source": "Flickr", |
|
|
|
|
|
|
|
"uploader": uploader, |
|
|
|
|
|
|
|
"page_url": page_url, |
|
|
|
|
|
|
|
"timestamp": datetime.now(timezone.utc).isoformat() |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
flickr_file = "/home/shane/foodie_automator/flickr_images.json" |
|
|
|
|
|
|
|
with open(flickr_file, 'a') as f: |
|
|
|
|
|
|
|
json.dump(flickr_data, f) |
|
|
|
|
|
|
|
f.write('\n') |
|
|
|
|
|
|
|
logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}") |
|
|
|
|
|
|
|
|
|
|
|
except requests.exceptions.HTTPError as e: |
|
|
|
logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})") |
|
|
|
if e.response.status_code == 429: |
|
|
|
return img_url, "Flickr", uploader, page_url |
|
|
|
logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
else: |
|
|
|
|
|
|
|
logging.warning(f"Download failed for {img_url}: {e}") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
|
|
logging.warning(f"OCR processing failed for {img_url}: {e}") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
finally: |
|
|
|
|
|
|
|
if temp_file and os.path.exists(temp_path): |
|
|
|
|
|
|
|
os.unlink(temp_path) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_ddg_for_flickr(query): |
|
|
|
def search_ddg_for_flickr(query): |
|
|
|
ddg_query = f"{query} site:flickr.com" |
|
|
|
ddg_query = f"{query} site:flickr.com" |
|
|
|
@ -352,7 +309,7 @@ def get_image(search_query): |
|
|
|
photo_id = match.group(1) |
|
|
|
photo_id = match.group(1) |
|
|
|
photo_ids.add(photo_id) |
|
|
|
photo_ids.add(photo_id) |
|
|
|
|
|
|
|
|
|
|
|
photo_ids = list(photo_ids)[:5] # Limit to 5 IDs |
|
|
|
photo_ids = list(photo_ids)[:2] # Limit to 2 IDs |
|
|
|
logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}") |
|
|
|
logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}") |
|
|
|
return photo_ids |
|
|
|
return photo_ids |
|
|
|
except Exception as e: |
|
|
|
except Exception as e: |
|
|
|
@ -571,8 +528,28 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw |
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' |
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' |
|
|
|
} |
|
|
|
} |
|
|
|
logging.info(f"Fetching image from {image_url} for '{post_title}'") |
|
|
|
logging.info(f"Fetching image from {image_url} for '{post_title}'") |
|
|
|
image_response = requests.get(image_url, headers=image_headers, timeout=10) |
|
|
|
|
|
|
|
image_response.raise_for_status() |
|
|
|
# Add rate limit handling for image download |
|
|
|
|
|
|
|
for attempt in range(3): |
|
|
|
|
|
|
|
try: |
|
|
|
|
|
|
|
image_response = requests.get(image_url, headers=image_headers, timeout=10) |
|
|
|
|
|
|
|
if image_response.status_code == 429: |
|
|
|
|
|
|
|
wait_time = 10 * (2 ** attempt) # 10s, 20s, 40s |
|
|
|
|
|
|
|
logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).") |
|
|
|
|
|
|
|
time.sleep(wait_time) |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
image_response.raise_for_status() |
|
|
|
|
|
|
|
break |
|
|
|
|
|
|
|
except requests.exceptions.HTTPError as e: |
|
|
|
|
|
|
|
if e.response.status_code == 429: |
|
|
|
|
|
|
|
wait_time = 10 * (2 ** attempt) |
|
|
|
|
|
|
|
logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).") |
|
|
|
|
|
|
|
time.sleep(wait_time) |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
raise |
|
|
|
|
|
|
|
else: |
|
|
|
|
|
|
|
logging.warning(f"Rate limit hit for {image_url} after retries. Failing image upload.") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
response = requests.post( |
|
|
|
response = requests.post( |
|
|
|
f"{wp_base_url}/media", |
|
|
|
f"{wp_base_url}/media", |
|
|
|
@ -1044,18 +1021,18 @@ def get_flickr_image(search_query, relevance_keywords): |
|
|
|
flickr_request_count += 1 |
|
|
|
flickr_request_count += 1 |
|
|
|
logging.info(f"Flickr request count: {flickr_request_count}/3600") |
|
|
|
logging.info(f"Flickr request count: {flickr_request_count}/3600") |
|
|
|
|
|
|
|
|
|
|
|
# Enforce a minimum delay of 5 seconds between Flickr requests |
|
|
|
# Enforce a minimum delay of 10 seconds between Flickr requests |
|
|
|
current_time = time.time() |
|
|
|
current_time = time.time() |
|
|
|
time_since_last_request = current_time - last_flickr_request_time |
|
|
|
time_since_last_request = current_time - last_flickr_request_time |
|
|
|
if time_since_last_request < 5: |
|
|
|
if time_since_last_request < 10: |
|
|
|
time.sleep(5 - time_since_last_request) |
|
|
|
time.sleep(10 - time_since_last_request) |
|
|
|
|
|
|
|
|
|
|
|
last_flickr_request_time = time.time() |
|
|
|
last_flickr_request_time = time.time() |
|
|
|
|
|
|
|
|
|
|
|
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} |
|
|
|
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} |
|
|
|
|
|
|
|
|
|
|
|
# Helper function to search Flickr with a given query |
|
|
|
# Helper function to search Flickr with a given query |
|
|
|
def search_flickr(query, per_page=20): |
|
|
|
def search_flickr(query, per_page=5): # Reduced per_page to limit results |
|
|
|
try: |
|
|
|
try: |
|
|
|
photos = flickr_api.Photo.search( |
|
|
|
photos = flickr_api.Photo.search( |
|
|
|
text=query, |
|
|
|
text=query, |
|
|
|
@ -1079,7 +1056,7 @@ def get_flickr_image(search_query, relevance_keywords): |
|
|
|
logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}") |
|
|
|
logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}") |
|
|
|
return None |
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
# Helper function to process a photo |
|
|
|
# Helper function to process a photo (fetch URL and metadata only) |
|
|
|
def process_photo(photo): |
|
|
|
def process_photo(photo): |
|
|
|
tags = [tag.text.lower() for tag in photo.getTags()] |
|
|
|
tags = [tag.text.lower() for tag in photo.getTags()] |
|
|
|
title = photo.title.lower() if photo.title else "" |
|
|
|
title = photo.title.lower() if photo.title else "" |
|
|
|
@ -1095,71 +1072,28 @@ def get_flickr_image(search_query, relevance_keywords): |
|
|
|
if not img_url or img_url in used_images: |
|
|
|
if not img_url or img_url in used_images: |
|
|
|
return None |
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
temp_file = None |
|
|
|
uploader = photo.owner.username |
|
|
|
try: |
|
|
|
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}" |
|
|
|
for attempt in range(3): |
|
|
|
|
|
|
|
img_response = requests.get(img_url, headers=headers, timeout=10) |
|
|
|
|
|
|
|
if img_response.status_code == 429: |
|
|
|
|
|
|
|
wait_time = 5 * (2 ** attempt) |
|
|
|
|
|
|
|
logging.warning(f"Rate limit hit for {img_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).") |
|
|
|
|
|
|
|
time.sleep(wait_time) |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
img_response.raise_for_status() |
|
|
|
|
|
|
|
break |
|
|
|
|
|
|
|
else: |
|
|
|
|
|
|
|
logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file: |
|
|
|
|
|
|
|
temp_file.write(img_response.content) |
|
|
|
|
|
|
|
temp_path = temp_file.name |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
img = Image.open(temp_path) |
|
|
|
|
|
|
|
text = pytesseract.image_to_string(img) |
|
|
|
|
|
|
|
char_count = len(text.strip()) |
|
|
|
|
|
|
|
logging.info(f"OCR processed {img_url}: {char_count} characters detected") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if char_count > 200: |
|
|
|
|
|
|
|
logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uploader = photo.owner.username |
|
|
|
used_images.add(img_url) |
|
|
|
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}" |
|
|
|
save_used_images() |
|
|
|
|
|
|
|
|
|
|
|
used_images.add(img_url) |
|
|
|
flickr_data = { |
|
|
|
save_used_images() |
|
|
|
"title": search_query, |
|
|
|
|
|
|
|
"image_url": img_url, |
|
|
|
flickr_data = { |
|
|
|
"source": "Flickr", |
|
|
|
"title": search_query, |
|
|
|
"uploader": uploader, |
|
|
|
"image_url": img_url, |
|
|
|
"page_url": page_url, |
|
|
|
"source": "Flickr", |
|
|
|
"timestamp": datetime.now(timezone.utc).isoformat() |
|
|
|
"uploader": uploader, |
|
|
|
} |
|
|
|
"page_url": page_url, |
|
|
|
flickr_file = "/home/shane/foodie_automator/flickr_images.json" |
|
|
|
"timestamp": datetime.now(timezone.utc).isoformat(), |
|
|
|
with open(flickr_file, 'a') as f: |
|
|
|
"ocr_chars": char_count |
|
|
|
json.dump(flickr_data, f) |
|
|
|
} |
|
|
|
f.write('\n') |
|
|
|
flickr_file = "/home/shane/foodie_automator/flickr_images.json" |
|
|
|
logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}") |
|
|
|
with open(flickr_file, 'a') as f: |
|
|
|
|
|
|
|
json.dump(flickr_data, f) |
|
|
|
|
|
|
|
f.write('\n') |
|
|
|
|
|
|
|
logging.info(f"Saved Flickr image to {flickr_file}: {img_url}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})") |
|
|
|
|
|
|
|
return img_url, "Flickr", uploader, page_url |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
except requests.exceptions.HTTPError as e: |
|
|
|
logging.info(f"Selected Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})") |
|
|
|
if e.response.status_code == 429: |
|
|
|
return img_url, "Flickr", uploader, page_url |
|
|
|
logging.warning(f"Rate limit hit for {img_url} after retries. Falling back to Pixabay.") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
else: |
|
|
|
|
|
|
|
logging.warning(f"Download failed for {img_url}: {e}") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
|
|
logging.warning(f"OCR processing failed for {img_url}: {e}") |
|
|
|
|
|
|
|
return None |
|
|
|
|
|
|
|
finally: |
|
|
|
|
|
|
|
if temp_file and os.path.exists(temp_path): |
|
|
|
|
|
|
|
os.unlink(temp_path) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Helper function to search DDG and extract Flickr photo IDs |
|
|
|
# Helper function to search DDG and extract Flickr photo IDs |
|
|
|
def search_ddg_for_flickr(query): |
|
|
|
def search_ddg_for_flickr(query): |
|
|
|
@ -1178,7 +1112,7 @@ def get_flickr_image(search_query, relevance_keywords): |
|
|
|
photo_id = match.group(1) |
|
|
|
photo_id = match.group(1) |
|
|
|
photo_ids.add(photo_id) |
|
|
|
photo_ids.add(photo_id) |
|
|
|
|
|
|
|
|
|
|
|
photo_ids = list(photo_ids)[:5] # Limit to 5 IDs |
|
|
|
photo_ids = list(photo_ids)[:2] # Limit to 2 IDs |
|
|
|
logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}") |
|
|
|
logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}") |
|
|
|
return photo_ids |
|
|
|
return photo_ids |
|
|
|
except Exception as e: |
|
|
|
except Exception as e: |
|
|
|
|