// omnisearch/src/Infobox/Dictionary.c

#include "Dictionary.h"
#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "../Scraping/Scraping.h"
#include <ctype.h>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

/*
 * Leading trigger phrases of a dictionary query, NULL-terminated.
 *
 * Entries are compared case-insensitively against the start of the query.
 * extract_word() strips only the first entry that matches, so a longer phrase
 * must be listed before any shorter entry that is a prefix of it (for example
 * "definition of " before "definition ").
 */
static const char *PREFIXES[] = {"what is the definition of ",
                                 "what's the definition of ",
                                 "what is the meaning of ",
                                 "what's the meaning of ",
                                 "what does the word ",
                                 "definition of ",
                                 "meaning of ",
                                 "def of ",
                                 "define ",
                                 "definition ",
                                 "define:",
                                 "def ",
                                 "def:",
                                 "what does ",
                                 "what is ",
                                 "what's ",
                                 "whats ",
                                 "meaning ",
                                 "dictionary ",
                                 "dict ",
                                 NULL};

/*
 * Trailing trigger phrases of a dictionary query, NULL-terminated.
 *
 * Every entry begins with a space. The table is searched case-insensitively
 * via strcasestr_impl() by both extract_word() and is_dictionary_query().
 */
static const char *SUFFIXES[] = {
    " definition",  " def",      " meaning", " mean",    " means",
    " dictionary",  " dict",     " define",  " defined", " definition?",
    " def?",        " meaning?", " mean?",   " means?",  " in english",
    " in english?", NULL};

/*
 * Filler words (each followed by a space) that extract_word() removes from the
 * front of the candidate word, repeatedly, until none matches. NULL-terminated.
 */
static const char *SKIP_WORDS[] = {"of ", "the ", "a ", "an ", NULL};

/*
 * Case-insensitive substring search.
 *
 * Returns a pointer to the first position in `haystack` where `needle` occurs
 * (ignoring case), or NULL when there is no occurrence. When `haystack` is
 * NULL, or `needle` is NULL or empty, `haystack` itself is returned unchanged.
 */
static const char *strcasestr_impl(const char *haystack, const char *needle) {
  if (haystack == NULL || needle == NULL || needle[0] == '\0')
    return haystack;

  const size_t needle_len = strlen(needle);
  const char *cursor = haystack;
  while (*cursor != '\0') {
    if (strncasecmp(cursor, needle, needle_len) == 0)
      return cursor;
    cursor++;
  }
  return NULL;
}

/* Growable byte buffer that accumulates a response body. */
struct MemStruct {
  char *memory; /* heap buffer; kept NUL-terminated by WriteCallback */
  size_t size;  /* number of payload bytes stored, excluding the NUL */
};

/*
 * Write callback (installed via CURLOPT_WRITEFUNCTION) that appends the
 * received bytes to the MemStruct passed as `userp`.
 *
 * Returns the number of bytes consumed (size * nmemb), or 0 when the buffer
 * could not be enlarged; in that case the existing buffer is left untouched.
 */
static size_t WriteCallback(void *contents, size_t size, size_t nmemb,
                            void *userp) {
  struct MemStruct *buf = userp;
  const size_t incoming = size * nmemb;

  /* One extra byte so the accumulated data always ends with a NUL. */
  char *grown = realloc(buf->memory, buf->size + incoming + 1);
  if (grown == NULL)
    return 0;

  memcpy(grown + buf->size, contents, incoming);
  buf->memory = grown;
  buf->size += incoming;
  grown[buf->size] = '\0';
  return incoming;
}

/*
 * Evaluate `xpath` against `doc` and return the text content of the first
 * matching node as a newly allocated string (release with free()).
 *
 * Returns NULL when the XPath context cannot be created, the expression
 * yields no nodes, the node has no content, or the copy cannot be allocated.
 */
static char *xpath_text(xmlDocPtr doc, const char *xpath) {
  xmlXPathContextPtr xctx = xmlXPathNewContext(doc);
  if (xctx == NULL)
    return NULL;

  xmlXPathObjectPtr match =
      xmlXPathEvalExpression((const xmlChar *)xpath, xctx);
  /* The context is only needed for the evaluation itself. */
  xmlXPathFreeContext(xctx);
  if (match == NULL)
    return NULL;

  char *text = NULL;
  xmlNodeSetPtr nodes = match->nodesetval;
  if (nodes != NULL && nodes->nodeNr != 0) {
    xmlChar *raw = xmlNodeGetContent(nodes->nodeTab[0]);
    if (raw != NULL) {
      /* Copy into malloc-owned memory so callers can use plain free(). */
      text = strdup((char *)raw);
      xmlFree(raw);
    }
  }

  xmlXPathFreeObject(match);
  return text;
}

/*
 * Append `text` to the heap string `*buf` (current length `*len`), growing the
 * allocation as needed. Returns 0 on success, -1 if the allocation fails, in
 * which case `*buf` and `*len` are left unchanged.
 */
static int dict_html_append(char **buf, size_t *len, const char *text) {
  size_t add = strlen(text);
  char *grown = realloc(*buf, *len + add + 1);
  if (!grown)
    return -1;
  memcpy(grown + *len, text, add + 1); /* +1 copies the terminating NUL */
  *buf = grown;
  *len += add;
  return 0;
}

/*
 * Append `open` + `value` + `close`, or nothing at all when `value` is NULL.
 * Returns 0 on success, -1 on allocation failure.
 */
static int dict_html_append_section(char **buf, size_t *len, const char *open,
                                    const char *value, const char *close) {
  if (!value)
    return 0;
  if (dict_html_append(buf, len, open) != 0 ||
      dict_html_append(buf, len, value) != 0 ||
      dict_html_append(buf, len, close) != 0)
    return -1;
  return 0;
}

/*
 * Render a dictionary entry as an HTML fragment.
 *
 * Each of `word`, `pron`, `pos`, `def` and `ex` may be NULL, in which case its
 * <div> is omitted. Returns a newly allocated string (release with free()),
 * or NULL if memory cannot be allocated.
 *
 * The fragment is built in a growable heap buffer. The previous fixed
 * 4096-byte stack buffer was overrun once the accumulated snprintf() return
 * values exceeded its size, because snprintf() reports the length it *would*
 * have written, not the length actually stored.
 *
 * NOTE(review): the values are inserted verbatim, without HTML escaping.
 * Confirm whether the scraped text needs escaping before it is rendered.
 */
static char *build_html(const char *word, const char *pron, const char *pos,
                        const char *def, const char *ex) {
  char *html = NULL;
  size_t len = 0;

  int failed =
      dict_html_append(
          &html, &len,
          "<div class='dict-container' style='line-height: 1.6;'>") != 0 ||
      dict_html_append_section(
          &html, &len,
          "<div style='font-size: 1.3em; font-weight: bold; "
          "margin-bottom: 4px;'>",
          word, "</div>") != 0 ||
      dict_html_append_section(
          &html, &len, "<div style='color: #666; margin-bottom: 8px;'>/", pron,
          "/</div>") != 0 ||
      dict_html_append_section(
          &html, &len,
          "<div style='font-style: italic; color: #888; margin-bottom: "
          "8px;'>",
          pos, "</div>") != 0 ||
      dict_html_append_section(&html, &len,
                               "<div style='margin-bottom: 8px;'>", def,
                               "</div>") != 0 ||
      dict_html_append_section(
          &html, &len,
          "<div style='color: #555; font-style: italic; margin-top: "
          "8px;'>\"",
          ex, "\"</div>") != 0 ||
      dict_html_append(&html, &len, "</div>") != 0;

  if (failed) {
    free(html);
    return NULL;
  }
  return html;
}

/*
 * Reduce a free-form query to the single lowercase word to look up.
 *
 * Steps, in order:
 *   1. strip at most one leading phrase from PREFIXES (first match wins);
 *   2. repeatedly strip leading spaces and leading SKIP_WORDS entries;
 *   3. cut the text at the first occurrence of any SUFFIXES entry, repeating
 *      until no entry is found;
 *   4. trim trailing spaces and '?', '!', '.' characters;
 *   5. lowercase the remainder and keep only the part before the first space.
 *
 * Returns a newly allocated string (release with free()), or NULL when
 * `query` is NULL, nothing is left after trimming, or allocation fails.
 */
static char *extract_word(const char *query) {
  if (!query)
    return NULL;

  const char *start = query;
  for (int i = 0; PREFIXES[i]; i++) {
    size_t len = strlen(PREFIXES[i]);
    if (strncasecmp(start, PREFIXES[i], len) == 0) {
      start += len;
      break;
    }
  }

  char *word = strdup(start);
  if (!word)
    return NULL;

  int changed = 1;
  while (changed) {
    changed = 0;

    /*
     * Drop leading spaces on every pass, not just once up front. Otherwise
     * input with doubled spaces such as "the  cat" leaves " cat" behind,
     * which blocks further skip-word matches and makes the final
     * cut-at-first-space step return an empty string.
     */
    size_t lead = strspn(word, " ");
    if (lead > 0)
      memmove(word, word + lead, strlen(word + lead) + 1);

    for (int i = 0; SKIP_WORDS[i]; i++) {
      size_t len = strlen(SKIP_WORDS[i]);
      if (strncasecmp(word, SKIP_WORDS[i], len) == 0) {
        memmove(word, word + len, strlen(word + len) + 1);
        changed = 1;
        break;
      }
    }
  }

  changed = 1;
  while (changed) {
    changed = 0;
    for (int i = 0; SUFFIXES[i]; i++) {
      const char *found = strcasestr_impl(word, SUFFIXES[i]);
      if (found) {
        /* `found` points into `word`; rebuild a writable pointer to it. */
        word[found - word] = '\0';
        changed = 1;
        break;
      }
    }
  }

  size_t len = strlen(word);
  while (len > 0 && (word[len - 1] == ' ' || word[len - 1] == '?' ||
                     word[len - 1] == '!' || word[len - 1] == '.')) {
    word[--len] = '\0';
  }

  if (len == 0) {
    free(word);
    return NULL;
  }

  for (size_t i = 0; i < len; i++)
    word[i] = tolower((unsigned char)word[i]);

  /* Only the first token is looked up; multi-word phrases are cut here. */
  char *space = strchr(word, ' ');
  if (space)
    *space = '\0';

  return word;
}

/*
 * Bare question openers that also begin ordinary, non-dictionary questions
 * ("what is the weather ..."). They get the stricter single-word test in
 * is_dictionary_query() instead of the plain prefix match.
 */
static const char *const DICT_GENERIC_OPENERS[] = {"what is ", "what's ",
                                                   "whats ", NULL};

/* Return 1 when `prefix` is exactly one of DICT_GENERIC_OPENERS, else 0. */
static int dict_is_generic_opener(const char *prefix) {
  for (int i = 0; DICT_GENERIC_OPENERS[i]; i++) {
    if (strcmp(prefix, DICT_GENERIC_OPENERS[i]) == 0)
      return 1;
  }
  return 0;
}

/*
 * Decide whether `query` looks like a request for a word definition.
 *
 * Returns 1 when any of the following holds, 0 otherwise (including NULL):
 *   - the query starts with a PREFIXES entry other than a generic opener and
 *     has non-space text after it;
 *   - the first occurrence of a SUFFIXES entry is preceded by some text,
 *     starts within the first 100 characters, and is followed only by spaces
 *     or '?', '!', '.' up to the end of the query;
 *   - the query is "what is" / "what's" / "whats" followed by a single word
 *     that does not start with an article or determiner.
 *
 * The generic openers are deliberately excluded from the first rule. They
 * are also listed in PREFIXES (so extract_word() can strip them), and
 * matching them there made every "what is ..." question return 1 before the
 * single-word/article filter below could run.
 *
 * NOTE(review): "what does " is still accepted by the plain prefix rule, so
 * "what does the fox say" is classified as a dictionary query; confirm
 * whether that is intended.
 */
int is_dictionary_query(const char *query) {
  if (!query)
    return 0;

  for (int i = 0; PREFIXES[i]; i++) {
    if (dict_is_generic_opener(PREFIXES[i]))
      continue;
    size_t len = strlen(PREFIXES[i]);
    if (strncasecmp(query, PREFIXES[i], len) == 0) {
      const char *after = query + len;
      while (*after == ' ')
        after++;
      if (*after != '\0')
        return 1;
    }
  }

  for (int i = 0; SUFFIXES[i]; i++) {
    const char *pos = strcasestr_impl(query, SUFFIXES[i]);
    if (pos) {
      const char *after = pos + strlen(SUFFIXES[i]);
      while (*after == ' ' || *after == '?' || *after == '!' || *after == '.')
        after++;
      if (*after == '\0' && pos > query && (pos - query) < 100)
        return 1;
    }
  }

  /* Generic "what is X": accept only a single, non-article word X. */
  const char *word = NULL;
  for (int i = 0; DICT_GENERIC_OPENERS[i]; i++) {
    size_t len = strlen(DICT_GENERIC_OPENERS[i]);
    if (strncasecmp(query, DICT_GENERIC_OPENERS[i], len) == 0) {
      word = query + len;
      break;
    }
  }
  if (!word)
    return 0;

  while (*word == ' ')
    word++;
  if (*word == '\0')
    return 0; /* nothing to look up after the opener */

  const char *articles[] = {"the ",   "your ", "my ",   "his ",  "her ",
                            "their ", "our ",  "this ", "that ", "these ",
                            "those ", "a ",    "an ",   NULL};
  for (int i = 0; articles[i]; i++) {
    if (strncasecmp(word, articles[i], strlen(articles[i])) == 0)
      return 0;
  }

  /* A space is tolerated only as trailing padding or right before a '?'. */
  const char *space = strchr(word, ' ');
  if (!space || *(space + 1) == '\0' || *(space + 1) == '?')
    return 1;

  return 0;
}

/*
 * Build the Cambridge Dictionary URL for the word contained in `query`.
 *
 * The word is obtained with extract_word() and URL-escaped with
 * curl_easy_escape(). Returns a newly allocated string (release with free()),
 * or NULL when no word can be extracted, the curl handle cannot be created,
 * escaping fails, or memory cannot be allocated.
 */
char *construct_dictionary_url(const char *query) {
  static const char base[] =
      "https://dictionary.cambridge.org/dictionary/english/";

  char *word = extract_word(query);
  if (!word)
    return NULL;

  CURL *curl = curl_easy_init();
  if (!curl) {
    free(word);
    return NULL;
  }

  char *url = NULL;
  /* curl_easy_escape() returns NULL on failure; never pass that to strlen. */
  char *escaped = curl_easy_escape(curl, word, 0);
  if (escaped) {
    /* sizeof(base) already accounts for the terminating NUL. */
    size_t url_size = sizeof(base) + strlen(escaped);
    url = malloc(url_size);
    if (url)
      snprintf(url, url_size, "%s%s", base, escaped);
    curl_free(escaped);
  }

  curl_easy_cleanup(curl);
  free(word);
  return url;
}

InfoBox fetch_dictionary_data(const char *query) {
  InfoBox info = {NULL, NULL, NULL, NULL};

  char *url = construct_dictionary_url(query);
  if (!url)
    return info;

  char *cache_key = cache_compute_key(url, 0, "dictionary");
  if (cache_key && get_cache_ttl_infobox() > 0) {
    char *cached_data = NULL;
    size_t cached_size = 0;
    if (cache_get(cache_key, (time_t)get_cache_ttl_infobox(), &cached_data,
                  &cached_size) == 0 &&
        cached_data && cached_size > 0) {
      htmlDocPtr doc = htmlReadMemory(cached_data, cached_size, url, NULL,
                                      HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
                                          HTML_PARSE_NOWARNING);
      if (doc) {
        char *word = xpath_text(doc, "//span[@class='hw dhw']");
        char *pron = xpath_text(
            doc,
            "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']");
        char *pos = xpath_text(doc, "//span[@class='pos dpos']");
        char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]");
        char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]");

        if (word && def) {
          info.title = strdup("Dictionary");
          info.extract = build_html(word, pron, pos, def, ex);
          info.thumbnail_url = strdup("/static/dictionary.jpg");
          info.url = strdup(url);
        }

        free(word);
        free(pron);
        free(pos);
        free(def);
        free(ex);
        xmlFreeDoc(doc);
      }
      free(cached_data);
      free(cache_key);
      free(url);
      return info;
    }
    free(cached_data);
  }
  free(cache_key);

  CURL *curl = curl_easy_init();
  if (!curl) {
    free(url);
    return info;
  }

  struct MemStruct chunk = {malloc(1), 0};
  curl_easy_setopt(curl, CURLOPT_URL, url);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk);
  curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0");
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  apply_proxy_settings(curl);

  if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
    cache_key = cache_compute_key(url, 0, "dictionary");
    if (cache_key && get_cache_ttl_infobox() > 0) {
      cache_set(cache_key, chunk.memory, chunk.size);
    }
    free(cache_key);

    htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL,
                                    HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
                                        HTML_PARSE_NOWARNING);
    if (doc) {
      char *word = xpath_text(doc, "//span[@class='hw dhw']");
      char *pron = xpath_text(
          doc,
          "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']");
      char *pos = xpath_text(doc, "//span[@class='pos dpos']");
      char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]");
      char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]");

      if (word && def) {
        info.title = strdup("Dictionary");
        info.extract = build_html(word, pron, pos, def, ex);
        info.thumbnail_url = strdup("/static/dictionary.jpg");
        info.url = strdup(url);
      }

      free(word);
      free(pron);
      free(pos);
      free(def);
      free(ex);
      xmlFreeDoc(doc);
    }
  }

  curl_easy_cleanup(curl);
  free(chunk.memory);
  free(url);
  return info;
}