defmodule Crawler do
  @moduledoc """
  A very simple, multi-threaded web crawler. There are lots of things in
  this crawler that could have been done better, but it works.

  The basic idea is that there is a message center that holds a complete
  list of the pages that have been crawled and are waiting to be crawled.
  The threaded crawlers ask the center for a URL, retrieve the page, and
  then send back to the message center all URLs that they find. They
  continue until the message center says to stop.

  Note that this crawler writes a lot of temporary files, so those files
  should be written to a local disk, and ideally that disk should be
  really fast (SSD).

  author: gtowell
  created: August 2022
  modified: December 2022
  """

  # The maximum number of URLs to try to read.
  @maxreads 50

  @doc """
  Read a URL. The reader stores the body of the URL in a temporary file
  and returns the name of that file, or nil if the URL could not be read.
  """
  def reader(urlstring) do
    :inets.start()
    :ssl.start()
    file_n = "./#{rstring(12)}"
    # IO.puts("#{urlstring} #{file_n}")

    case :httpc.request(:get, {urlstring, []}, [timeout: :timer.seconds(15)], [stream: to_charlist(file_n)]) do
      {:ok, :saved_to_file} ->
        file_n

      _ ->
        IO.puts("Problem with #{urlstring}")
        nil
    end
  end

  @doc """
  Extract URLs from a read web page. This uses a VERY crude extraction
  approach based on string splitting rather than any attempt to really
  parse the HTML. Hence it is probably error prone.
  """
  def extractor(url, file_name) do
    uri = URI.parse(url)
    {:ok, txt} = File.read(file_name)

    txt
    |> String.split("href=\"")
    |> List.delete_at(0)
    |> Enum.map(fn v ->
      [h | _] = String.split(v, "\"", parts: 2)
      h
    end)
    |> Enum.map(fn v -> URI.merge(uri, URI.parse(v)) end)
  end

  @doc """
  The crawler. Most of the work is done by the two functions reader and
  extractor. This function is a recursive loop. Each time through the
  loop it asks the message center for a URL, then calls reader and
  extractor, and finally reports to the message center any URLs obtained
  by the extractor.
  """
  def spider(message_center, id_no) do
    send(message_center, {:get, self()})

    receive do
      {:uri, uristring} ->
        # IO.puts "uri #{uristring}"
        file_name = reader(uristring)

        if file_name != nil do
          uris = extractor(uristring, file_name)
          # delete the temp file
          File.rm(file_name)
          Enum.each(uris, fn uu -> send(message_center, {:put, uu}) end)
        end

        spider(message_center, id_no)

      {:retry, milli_seconds} ->
        IO.puts("wait #{id_no}")
        :timer.sleep(milli_seconds)
        spider(message_center, id_no)

      {:end} ->
        IO.puts("ENDDD #{id_no}")
    end
  end
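  # For reference, the message protocol between a spider and the message
  # center (see spider/2 above and loop/3 below):
  #
  #   spider -> center : {:get, self()}      ask for a URL to crawl
  #   center -> spider : {:uri, uristring}   a URL to fetch
  #   center -> spider : {:retry, millis}    queue empty; sleep, then ask again
  #   center -> spider : {:end}              crawl limit reached; the spider stops
  #   spider -> center : {:put, uri}         a URL found on a fetched page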
  @doc """
  Get the crawler going and wait for it to complete.
  """
  def mainThread do
    # start the message center
    {:ok, rcvr} = Task.start_link(Crawler, :loop, [%{}, [], 0])

    # three starting URLs
    send(rcvr, {:put, URI.parse("https://cs.brynmawr.edu/index.html")})
    send(rcvr, {:put, URI.parse("https://cs.brynmawr.edu/cs245/index.html")})
    send(rcvr, {:put, URI.parse("https://cs.brynmawr.edu/cs151/index.html")})

    # Start the reading threads and wait for them to end.
    # Starts 2 threads for every core. For the web crawling task, this
    # is really conservative.
    max_concurrency = System.schedulers_online() * 2

    threads =
      Enum.map(1..max_concurrency, fn sv ->
        Task.async(Crawler, :spider, [rcvr, sv])
      end)

    Task.await_many(threads, :infinity)

    # print final stats
    send(rcvr, {:print})
    send(rcvr, {:end, self()})

    receive do
      {:final, aa} ->
        IO.inspect("AAA")
        # Enum.each(aa, fn {k, v} ->
        #   IO.write "#{k} ---> "
        #   IO.inspect v
        # end)
        IO.puts(length(Map.keys(aa)))
    end
  end

  defp good_url(%URI{} = uu) do
    cond do
      uu.host == nil -> false
      not String.contains?(uu.host, "brynmawr.edu") -> false
      uu.scheme == "mailto" -> false
      uu.scheme == "javascript" -> false
      uu.path != nil and String.length(uu.path) > 100 -> false
      true -> true
    end
  end

  defp good_url(_) do
    false
  end

  @doc """
  The message center. This always receives messages in the form of a
  tuple whose first element is an atom stating the topic of the message.
  The two main messages are :get, in which the sender is asking for a
  URL, and :put, in which the sender is saying "here is a URL I found".
  The function is recursive: every time a message is received, that
  message is handled and the function calls itself.
  """
  def loop(waiting_retrieved_map, waiting_list, counter) do
    receive do
      {:print} ->
        # Show some info about the state of the crawl.
        IO.puts("Waiting #{length(waiting_list)}")
        IO.puts("Retrieved #{counter}")
        IO.puts("Seen #{length(Map.keys(waiting_retrieved_map))}")
        loop(waiting_retrieved_map, waiting_list, counter)

      {:end, caller} ->
        # kill the message center
        # IO.puts "Got END"
        send(caller, {:final, waiting_retrieved_map})

      {:put, uri} ->
        # received a, possibly, new URL
        uristring = to_string(uri)
        v = Map.get(waiting_retrieved_map, uristring)

        # is this a new URL?
        if v == nil do
          # only crawl the brynmawr.edu domain
          if good_url(uri) do
            waiting_retrieved_map = Map.put(waiting_retrieved_map, uristring, 1)
            waiting_list = [uristring | waiting_list]
            loop(waiting_retrieved_map, waiting_list, counter)
          else
            loop(waiting_retrieved_map, waiting_list, counter)
          end
        else
          # not new, so increment the map value to show that it has been seen again
          waiting_retrieved_map = Map.put(waiting_retrieved_map, uristring, v + 1)
          loop(waiting_retrieved_map, waiting_list, counter)
        end

      {:get, caller} ->
        # Request for a URL to read.
        # Only give out a URL if the max crawl is not exceeded.
        if counter < @maxreads do
          case waiting_list do
            [h | r] ->
              # the list of URLs to be crawled has at least one member
              waiting_list = r
              send(caller, {:uri, h})

              waiting_retrieved_map =
                Map.put(waiting_retrieved_map, h, Map.get(waiting_retrieved_map, h) + 1000)

              loop(waiting_retrieved_map, waiting_list, counter + 1)

            _ ->
              # Nothing in the crawl queue.
              # Tell the requester to try again in a while.
              send(caller, {:retry, 2020})
              loop(waiting_retrieved_map, waiting_list, counter)
          end
        else
          # Max crawl has been reached. Shut down the requesting spider.
          send(caller, {:end})
          loop(waiting_retrieved_map, waiting_list, counter)
        end
    end
  end

  @doc """
  Create a random string of the given length.
  """
  def rstring(len) do
    Enum.map(1..len, fn _ -> Enum.random(?a..?z) end) |> to_string()
  end
end

Crawler.mainThread()
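# To run this script from the command line (assuming it has been saved
# under a name such as crawler.exs -- the file name is only an example):
#
#   elixir crawler.exs
#
# :inets and :ssl ship with a standard Erlang/OTP installation, so no
# extra dependencies should be needed.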