update whole file

fix
Removed Redundant process_photo
2025-05-03 15:53:22 +10:00 · 2025-05-03 15:21:08 +10:00 · 2025-05-03 15:05:16 +10:00 · 2025-05-03 14:49:06 +10:00 · 2025-05-03 14:35:07 +10:00 · 2025-05-03 14:22:46 +10:00
2 changed files with 190 additions and 329 deletions
@@ -17,7 +17,7 @@ from requests.packages.urllib3.util.retry import Retry
 from requests.adapters import HTTPAdapter
 from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
-    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES, CTAS,
+    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS
 )
 from foodie_utils import (
@@ -269,10 +269,14 @@ def curate_from_rss():
        # Fetch image
        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
        if not image_url:
            logging.info(f"Flickr fetch failed for '{image_query}'. Falling back to Pixabay.")
            image_url, image_source, uploader, page_url = get_image(image_query)
            if not image_url:
                logging.info(f"Pixabay fetch failed for '{image_query}'. Skipping article '{title}'.")
                attempts += 1
                continue
        hook = get_dynamic_hook(post_data["title"]).strip()
        # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)
        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
@@ -236,178 +236,12 @@ def select_best_persona(interest_score, content=""):
    return random.choice(personas)
 def get_image(search_query):
    global last_flickr_request_time, flickr_request_count
    reset_flickr_request_count()
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
    if time_since_last_request < 10:
        time.sleep(10 - time_since_last_request)
    last_flickr_request_time = time.time()
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-    def search_flickr(query, per_page=5):
+    # Try Pixabay with the original query
        try:
            photos = flickr_api.Photo.search(
                text=query,
                per_page=per_page,
                sort='relevance',
                safe_search=1,
                media='photos',
                license='4,5,9,10'
            )
            return photos
        except Exception as e:
            logging.warning(f"Flickr API error for query '{query}': {e}")
            return []
    def fetch_photo_by_id(photo_id):
        try:
            photo = flickr_api.Photo(id=photo_id)
            return photo
        except Exception as e:
            logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
            return None
    def process_photo(photo):
        tags = [tag.text.lower() for tag in photo.getTags()]
        title = photo.title.lower() if photo.title else ""
        matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
        if matched_keywords:
            logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
            return None
        img_url = photo.getPhotoFile(size_label='Medium')
        if not img_url or img_url in used_images:
            return None
        uploader = photo.owner.username
        page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
        used_images.add(img_url)
        save_used_images()
        flickr_data = {
            "title": search_query,
            "image_url": img_url,
            "source": "Flickr",
            "uploader": uploader,
            "page_url": page_url,
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
        flickr_file = "/home/shane/foodie_automator/flickr_images.json"
        with open(flickr_file, 'a') as f:
            json.dump(flickr_data, f)
            f.write('\n')
        logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
        logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
        return img_url, "Flickr", uploader, page_url
    def search_ddg_for_flickr(query):
        ddg_query = f"{query} site:flickr.com"
        ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
        try:
            response = requests.get(ddg_url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            photo_ids = set()
            for link in soup.find_all('a', href=True):
                href = link['href']
                match = re.search(r'flickr\.com/photos/[^/]+/(\d+)', href)
                if match:
                    photo_id = match.group(1)
                    photo_ids.add(photo_id)
            photo_ids = list(photo_ids)[:2]  # Limit to 2 IDs
            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
            return photo_ids
        except Exception as e:
            logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
            return set()
    def classify_keywords(keywords):
        prompt = (
            "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities like 'Taco Bell' or 'Paris') or 'generic' (e.g., common or abstract terms like 'dining' or 'trends'). "
            "Return a JSON object mapping each keyword to its classification.\n\n"
            "Keywords: " + ", ".join(keywords) + "\n\n"
            "Example output format (do not use these exact keywords in your response):\n"
            "```json\n"
            "{\n"
            "  \"keyword1\": \"specific\",\n"
            "  \"keyword2\": \"generic\"\n"
            "}\n```"
        )
        try:
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": "You are a helper that classifies keywords."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                temperature=0.5
            )
            raw_response = response.choices[0].message.content
            json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
            if not json_match:
                logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
                return {kw: "specific" for kw in keywords}
            classifications = json.loads(json_match.group(1))
            return classifications
        except Exception as e:
            logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
            return {kw: "specific" for kw in keywords}
    # Step 1: Search DDG to find Flickr photo IDs
    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
    photo_ids = search_ddg_for_flickr(search_query)
    if photo_ids:
        for photo_id in photo_ids:
            photo = fetch_photo_by_id(photo_id)
            if photo:
                result = process_photo(photo)
                if result:
                    return result
    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
    keywords = search_query.lower().split()
    if len(keywords) > 1:
        classifications = classify_keywords(keywords)
        logging.info(f"Keyword classifications: {classifications}")
        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
        if specific_keywords:
            for keyword in specific_keywords:
                logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
                photos = search_flickr(keyword)
                for photo in photos:
                    result = process_photo(photo)
                    if result:
                        return result
    # Step 3: Final fallback to a generic food-related query
    logging.info(f"No results found. Falling back to generic query: 'food dining'")
    photos = search_flickr("food dining")
    for photo in photos:
        result = process_photo(photo)
        if result:
            return result
    logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.")
    # Fallback to Pixabay
    try:
        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10"
-        response = requests.get(pixabay_url, timeout=10)
+        response = requests.get(pixabay_url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
@@ -421,15 +255,43 @@ def get_image(search_query):
            used_images.add(img_url)
            save_used_images()
-            logging.debug(f"Image selected for query '{search_query}': {img_url}")
+            logging.info(f"Selected Pixabay image: {img_url} by {uploader} for query '{search_query}'")
            return img_url, "Pixabay", uploader, page_url
-        logging.warning(f"No valid Pixabay image found for query '{search_query}'.")
+        logging.info(f"No valid Pixabay image found for query '{search_query}'. Trying fallback query.")
        return None, None, None, None
    except Exception as e:
-        logging.error(f"Pixabay image fetch failed for query '{search_query}': {e}")
+        logging.warning(f"Pixabay image fetch failed for query '{search_query}': {e}")
-        return None, None, None, None
+    
    # Fallback to a generic query
    fallback_query = "food dining"
    try:
        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(fallback_query)}&image_type=photo&per_page=10"
        response = requests.get(pixabay_url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        for hit in data.get('hits', []):
            img_url = hit.get('webformatURL')
            if not img_url or img_url in used_images:
                continue
            uploader = hit.get('user', 'Unknown')
            page_url = hit.get('pageURL', img_url)
            used_images.add(img_url)
            save_used_images()
            logging.info(f"Selected Pixabay fallback image: {img_url} by {uploader} for query '{fallback_query}'")
            return img_url, "Pixabay", uploader, page_url
        logging.warning(f"No valid Pixabay image found for fallback query '{fallback_query}'.")
    except Exception as e:
        logging.warning(f"Pixabay fallback image fetch failed for query '{fallback_query}': {e}")
    # Ultimate fallback: return None but log clearly
    logging.error(f"All image fetch attempts failed for query '{search_query}'. Returning None.")
    return None, None, None, None
 def generate_image_query(title, summary):
    try:
@@ -1010,24 +872,19 @@ if os.path.exists(used_images_file):
            else:
                data = json.loads(content)
                if not isinstance(data, list):
-                    logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. Resetting.")
+                    logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. Converting to list.")
-                    data = []
+                    if isinstance(data, dict):
-                else:
+                        # If it's a dict, try to extract URLs from values
-                    # Handle malformed format (list of lists or invalid entries)
+                        data = [v for v in data.values() if isinstance(v, str) and v.startswith('https://')]
-                    flat_data = []
+                    else:
-                    for item in data:
+                        logging.warning(f"Cannot convert {type(data)} to list. Resetting to empty list.")
-                        if isinstance(item, str) and item.startswith('https://'):
+                        data = []
-                            flat_data.append(item)
+                # Filter out non-string or non-URL entries
-                        elif isinstance(item, list):
+                data = [item for item in data if isinstance(item, str) and item.startswith('https://')]
                            logging.warning(f"Fixing malformed entry in {used_images_file}: {item}")
                            flat_data.extend([sub_item for sub_item in item if isinstance(sub_item, str) and sub_item.startswith('https://')])
                        else:
                            logging.warning(f"Skipping invalid entry in {used_images_file}: {item}")
                    data = flat_data
            used_images.update(data)
        logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}")
    except Exception as e:
-        logging.warning(f"Failed to load used images from {used_images_file}: {e}. Resetting file.")
+        logging.warning(f"Failed to load used images from {used_images_file}: {e}. Resetting to empty set.")
        used_images = set()
        with open(used_images_file, 'w') as f:
            json.dump([], f)
@@ -1035,17 +892,14 @@ if os.path.exists(used_images_file):
 # Function to save used_images to file
 def save_used_images():
    try:
        # Ensure used_images contains only valid URLs
        valid_urls = [url for url in used_images if isinstance(url, str) and url.startswith('https://')]
        if len(valid_urls) != len(used_images):
            logging.warning(f"Found {len(used_images) - len(valid_urls)} invalid URLs in used_images set")
        with open(used_images_file, 'w') as f:
-            f.write('[\n')
+            json.dump(valid_urls, f, indent=2)
-            urls = list(used_images)
+        logging.info(f"Saved {len(valid_urls)} used image URLs to {used_images_file}")
            for i, url in enumerate(urls):
                f.write(f'"{url}"')
                if i < len(urls) - 1:
                    f.write(',\n')
                else:
                    f.write('\n')
            f.write(']')
        logging.info(f"Saved {len(used_images)} used image URLs to {used_images_file}")
    except Exception as e:
        logging.warning(f"Failed to save used images to {used_images_file}: {e}")
@@ -1055,6 +909,134 @@ def reset_flickr_request_count():
        flickr_request_count = 0
        flickr_request_start_time = time.time()
 def process_photo(photo, search_query):
    """Validate a Flickr photo and claim it as the image for *search_query*.

    The photo is rejected when any entry of ``exclude_keywords`` equals one of
    its tags or occurs in its title, when no file URL can be obtained for the
    'Large' or 'Medium' size, or when the URL is already in ``used_images``.
    On success the URL is added to ``used_images`` (persisted through
    ``save_used_images``) and one JSON line describing the pick is appended to
    the Flickr metadata file.

    Args:
        photo: Flickr photo object; this function uses its ``getTags()``,
            ``title``, ``id``, ``owner`` and ``getPhotoFile()`` members.
        search_query: Query the photo was found for; stored as the metadata
            record's ``title`` and echoed in the log.

    Returns:
        ``(img_url, "Flickr", uploader, page_url)`` on success, else ``None``.
    """
    tags = [tag.text.lower() for tag in photo.getTags()]
    title = photo.title.lower() if photo.title else ""
    # Tags are compared for equality (list membership); the title is a
    # substring match.
    matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
    if matched_keywords:
        logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
        return None

    # Try 'Large' size first. 'Medium' is used both when the lookup raises
    # and when it comes back empty, so an empty result does not reject a photo
    # that still has a usable smaller rendition.
    img_url = None
    try:
        img_url = photo.getPhotoFile(size_label='Large')
    except flickr_api.flickrerrors.FlickrError as e:
        logging.info(f"Large size not available for photo {photo.id}: {e}, trying Medium")
    if not img_url:
        try:
            img_url = photo.getPhotoFile(size_label='Medium')
        except flickr_api.flickrerrors.FlickrError as e:
            logging.warning(f"Medium size not available for photo {photo.id}: {e}")
            return None

    if not img_url or img_url in used_images:
        logging.info(f"Image URL invalid or already used for photo {photo.id}: {img_url}")
        return None

    uploader = photo.owner.username
    page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"

    # Claim the URL before returning so later searches skip it.
    used_images.add(img_url)
    save_used_images()

    flickr_data = {
        "title": search_query,
        "image_url": img_url,
        "source": "Flickr",
        "uploader": uploader,
        "page_url": page_url,
        "timestamp": datetime.now(timezone.utc).isoformat()
    }
    flickr_file = "/home/shane/foodie_automator/flickr_images.json"
    # The metadata log is one JSON object per line. Writing it is best-effort:
    # the URL is already claimed above, so a failed write must not discard an
    # otherwise valid image.
    try:
        with open(flickr_file, 'a') as f:
            json.dump(flickr_data, f)
            f.write('\n')
        logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
    except OSError as e:
        logging.warning(f"Failed to save Flickr image metadata to {flickr_file}: {e}")

    logging.info(f"Selected Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
    return img_url, "Flickr", uploader, page_url
 def search_flickr(query, per_page=5):
    """Run a relevance-sorted Flickr photo search for *query*.

    The search asks for photos only, with safe search level 1 and the license
    IDs 4, 5, 9 and 10, returning up to *per_page* results.

    Returns:
        Whatever ``flickr_api.Photo.search`` yields, or ``[]`` if the call
        raises for any reason (the error is logged as a warning).
    """
    search_params = dict(
        text=query,
        per_page=per_page,
        sort='relevance',
        safe_search=1,
        media='photos',
        license='4,5,9,10',
    )
    try:
        return flickr_api.Photo.search(**search_params)
    except Exception as err:
        logging.warning(f"Flickr API error for query '{query}': {err}")
    return []
 def fetch_photo_by_id(photo_id):
    """Build a Flickr photo object for *photo_id*.

    Returns:
        The ``flickr_api.Photo`` instance, or ``None`` if constructing it
        raises (the error is logged as a warning).
    """
    try:
        return flickr_api.Photo(id=photo_id)
    except Exception as err:
        logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {err}")
    return None
 def search_ddg_for_flickr(query):
    """Look up Flickr photo IDs for *query* through a DuckDuckGo site search.

    The DuckDuckGo result page for ``"<query> site:flickr.com"`` is fetched
    and every anchor whose ``href`` looks like a Flickr photo page
    (``flickr.com/photos/<owner>/<digits>``) contributes its numeric ID.

    Args:
        query: Free-text image search query.

    Returns:
        A list of at most two distinct photo ID strings, in the order their
        links first appear on the page. An empty list is returned when
        nothing matches or when the request or parsing fails (the failure is
        logged as a warning).
    """
    max_ids = 2  # Limit to 2 IDs
    ddg_query = f"{query} site:flickr.com"
    ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
    try:
        response = requests.get(ddg_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # A list with a membership check keeps first-seen order, so the IDs
        # that are kept are the first ones on the page rather than an
        # arbitrary pair chosen by set iteration order.
        photo_ids = []
        for link in soup.find_all('a', href=True):
            match = re.search(r'flickr\.com/photos/[^/]+/(\d+)', link['href'])
            if not match:
                continue
            photo_id = match.group(1)
            if photo_id not in photo_ids:
                photo_ids.append(photo_id)
                if len(photo_ids) == max_ids:
                    break
        logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
        return photo_ids
    except Exception as e:
        logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
        # Same type as the success path; callers only test truthiness and
        # iterate, so an empty list is a drop-in replacement for the old set().
        return []
 def classify_keywords(keywords):
    """Ask the chat model to label each keyword as 'specific' or 'generic'.

    A prompt listing *keywords* is sent to ``LIGHT_TASK_MODEL`` through
    ``client.chat.completions.create``; the reply is expected to contain a
    fenced ``json`` block holding an object that maps keyword to label.

    Args:
        keywords: Iterable of keyword strings taken from an image search query.

    Returns:
        The parsed mapping from the model's reply. If the request fails, the
        reply has no fenced JSON block, the JSON is invalid, or the JSON is
        not an object, every keyword is mapped to ``"specific"`` instead.
    """
    prompt = (
        "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities like 'Taco Bell' or 'Paris') or 'generic' (e.g., common or abstract terms like 'dining' or 'trends'). "
        "Return a JSON object mapping each keyword to its classification.\n\n"
        "Keywords: " + ", ".join(keywords) + "\n\n"
        "Example output format (do not use these exact keywords in your response):\n"
        "```json\n"
        "{\n"
        "  \"keyword1\": \"specific\",\n"
        "  \"keyword2\": \"generic\"\n"
        "}\n```"
    )
    try:
        response = client.chat.completions.create(
            model=LIGHT_TASK_MODEL,
            messages=[
                {"role": "system", "content": "You are a helper that classifies keywords."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0.5
        )
        raw_response = response.choices[0].message.content
        json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
        if not json_match:
            logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
            return {kw: "specific" for kw in keywords}
        classifications = json.loads(json_match.group(1))
        # Callers iterate the result with .items(); valid JSON that is not an
        # object (a list, a string, ...) must take the same fallback as an
        # unparsable reply instead of crashing the caller.
        if not isinstance(classifications, dict):
            logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
            return {kw: "specific" for kw in keywords}
        return classifications
    except Exception as e:
        logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
        return {kw: "specific" for kw in keywords}
 def get_flickr_image(search_query, relevance_keywords):
    global last_flickr_request_time, flickr_request_count
@@ -1070,131 +1052,6 @@ def get_flickr_image(search_query, relevance_keywords):
    last_flickr_request_time = time.time()
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
    # Helper function to search Flickr with a given query
    def search_flickr(query, per_page=5):  # Reduced per_page to limit results
        try:
            photos = flickr_api.Photo.search(
                text=query,
                per_page=per_page,
                sort='relevance',
                safe_search=1,
                media='photos',
                license='4,5,9,10'
            )
            return photos
        except Exception as e:
            logging.warning(f"Flickr API error for query '{query}': {e}")
            return []
    # Helper function to fetch a Flickr photo by ID
    def fetch_photo_by_id(photo_id):
        try:
            photo = flickr_api.Photo(id=photo_id)
            return photo
        except Exception as e:
            logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
            return None
    # Helper function to process a photo (fetch URL and metadata only)
    def process_photo(photo):
        tags = [tag.text.lower() for tag in photo.getTags()]
        title = photo.title.lower() if photo.title else ""
        matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
        if matched_keywords:
            logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
            return None
        img_url = photo.getPhotoFile(size_label='Large')
        if not img_url:
            img_url = photo.getPhotoFile(size_label='Medium')
        if not img_url or img_url in used_images:
            return None
        uploader = photo.owner.username
        page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
        used_images.add(img_url)
        save_used_images()
        flickr_data = {
            "title": search_query,
            "image_url": img_url,
            "source": "Flickr",
            "uploader": uploader,
            "page_url": page_url,
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
        flickr_file = "/home/shane/foodie_automator/flickr_images.json"
        with open(flickr_file, 'a') as f:
            json.dump(flickr_data, f)
            f.write('\n')
        logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
        logging.info(f"Selected Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
        return img_url, "Flickr", uploader, page_url
    # Helper function to search DDG and extract Flickr photo IDs
    def search_ddg_for_flickr(query):
        ddg_query = f"{query} site:flickr.com"
        ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
        try:
            response = requests.get(ddg_url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            photo_ids = set()
            for link in soup.find_all('a', href=True):
                href = link['href']
                match = re.search(r'flickr\.com/photos/[^/]+/(\d+)', href)
                if match:
                    photo_id = match.group(1)
                    photo_ids.add(photo_id)
            photo_ids = list(photo_ids)[:2]  # Limit to 2 IDs
            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
            return photo_ids
        except Exception as e:
            logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
            return set()
    # Helper function to classify keywords as specific or generic
    def classify_keywords(keywords):
        prompt = (
            "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities like 'Taco Bell' or 'Paris') or 'generic' (e.g., common or abstract terms like 'dining' or 'trends'). "
            "Return a JSON object mapping each keyword to its classification.\n\n"
            "Keywords: " + ", ".join(keywords) + "\n\n"
            "Example output format (do not use these exact keywords in your response):\n"
            "```json\n"
            "{\n"
            "  \"keyword1\": \"specific\",\n"
            "  \"keyword2\": \"generic\"\n"
            "}\n```"
        )
        try:
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": "You are a helper that classifies keywords."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                temperature=0.5
            )
            raw_response = response.choices[0].message.content
            json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
            if not json_match:
                logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
                return {kw: "specific" for kw in keywords}
            classifications = json.loads(json_match.group(1))
            return classifications
        except Exception as e:
            logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
            return {kw: "specific" for kw in keywords}
    # Step 1: Search DDG to find Flickr photo IDs
    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
    photo_ids = search_ddg_for_flickr(search_query)
@@ -1202,7 +1059,7 @@ def get_flickr_image(search_query, relevance_keywords):
        for photo_id in photo_ids:
            photo = fetch_photo_by_id(photo_id)
            if photo:
-                result = process_photo(photo)
+                result = process_photo(photo, search_query)
                if result:
                    return result
@@ -1219,7 +1076,7 @@ def get_flickr_image(search_query, relevance_keywords):
                logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
                photos = search_flickr(keyword)
                for photo in photos:
-                    result = process_photo(photo)
+                    result = process_photo(photo, search_query)
                    if result:
                        return result
@@ -1228,7 +1085,7 @@ def get_flickr_image(search_query, relevance_keywords):
    logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
    photos = search_flickr(fallback_query)
    for photo in photos:
-        result = process_photo(photo)
+        result = process_photo(photo, search_query)
        if result:
            return result
Author	SHA1	Message	Date
Shane	427a5cb919	update whole file	2025-05-03 15:53:22 +10:00
Shane	6d945dae67	fix	2025-05-03 15:21:08 +10:00
Shane	1fd1ad361b	Removed Redundant process_photo	2025-05-03 15:05:16 +10:00
Shane	a5182bdfb9	fix	2025-05-03 14:49:06 +10:00
Shane	be6514e4e3	fix	2025-05-03 14:35:07 +10:00
Shane	c936555741	fix flickr image large issue	2025-05-03 14:22:46 +10:00
Shane	cdc54f3f14	remove cta import	2025-05-03 14:06:58 +10:00