{- LinkMetadata.hs: module for generating Pandoc links which are annotated with metadata, which can then be displayed to the user as 'popups' by /static/js/popups.js. These popups can be excerpts, abstracts, article introductions etc, and make life much more pleasant for the reader - hover over link, popup, read, decide whether to go to link.
Author: Gwern Branwen
Date: 2019-08-20
License: CC-0
-}
-- TODO:
-- 1. fix Unicode handling: `shellToCommand` seems to mangle Unicode, screwing up abstracts
-- 2. scrape more sites: possibilities include predictionbook.com, amazon.com, nature.com, longbets.org, *plos.org, ssrn.com, wiley.com, bmj.com, cran.r-project.org, and rand.org
-- 3. bugs in packages: the WMF API omits the need for `-L` in curl but somehow their live demo works anyway (?!); rxvist doesn't appear to support all biorxiv schemas, including the '/early/' links, forcing me to use curl+Tagsoup; the R library 'fulltext' crashes on examples like `ft_abstract(x = c("10.1038/s41588-018-0183-z"))`
-- fix arxiv PDF links
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}
module LinkMetadata where
import qualified Data.ByteString.Lazy as B (concat, length, unpack, putStrLn)
import qualified Data.ByteString.Lazy.UTF8 as U -- (encode, decode) -- TODO: why doesn't using U.toString fix the Unicode problems?
import Data.Aeson
import GHC.Generics
import Data.List
import Data.Char
import Data.Maybe
import qualified Data.Map.Strict as M -- (fromList, lookup, Map)
import Text.Pandoc
import qualified Data.Text.IO as TIO
import qualified Data.Text as T
import Data.FileStore.Utils (runShellCommand)
import System.Exit (ExitCode(ExitFailure))
import Data.List.Utils
import System.Directory
import System.IO.Temp
import Network.Api.Arxiv hiding (Link)
import Text.HTML.TagSoup -- (renderTagsOptions,parseTags,renderOptions, optMinimize, Tag(TagOpen))
import Text.Show.Pretty (ppShow)
type Metadata = M.Map Path MetadataItem -- (Title, Author, Date, DOI, Abstract)
type MetadataItem = (String, String, String, String, String)
type MetadataList = [(Path, MetadataItem)]
type Path = String
readLinkMetadata :: IO Metadata
readLinkMetadata = do
-- for hand created definitions, to be saved
custom <- (fmap (read . T.unpack) $ TIO.readFile "static/metadata/custom.hs") :: IO MetadataList
-- auto-generated cached definitions; can be deleted if gone stale
auto <- (fmap (read . T.unpack) $ TIO.readFile "static/metadata/auto.hs") :: IO MetadataList
return $ M.union (M.fromList custom) (M.fromList auto) -- left-biased, 'custom' overrides 'auto'
writeLinkMetadata :: Path -> MetadataItem -> IO ()
writeLinkMetadata l m = do auto <- (fmap (read . T.unpack) $ TIO.readFile "static/metadata/auto.hs") :: IO MetadataList
let auto' = M.insert l m $ M.fromList auto
temp <- writeSystemTempFile "popup-metadata-auto.db.hs" (ppShow $ M.toAscList auto')
renameFile temp "static/metadata/auto.hs"
-- alternative: because of the slowdown from blind rewriting of the db, it may be easier
-- to do one pass, appending everything to a file, and then manually editing file to populate auto.hs:
-- appendFile "static/metadata/auto.hs-tmp" (ppShow [(l, m)])
annotateLink :: Metadata -> Inline -> IO Inline
-- Pandoc types: Link = Link Attr [Inline] Target; Attr = (String, [String], [(String, String)]); Target = (String, String)
annotateLink md x@(Link attr text (target, tooltip)) =
do
-- normalize: convert 'https://www.shawwn.com/docs/foo.pdf' to '/docs/foo.pdf' and './docs/foo.pdf' to '/docs/foo.pdf'
-- the leading '/' indicates this is a local shawwn.com file
let target' = replace "https://www.shawwn.com/" "/" target
let target'' = if head target' == '.' then drop 1 target' else target'
let annotated = M.lookup target'' md
print (attr, text, target, tooltip, annotated)
case annotated of
-- the link has a valid annotation already defined, so build & return
Just l -> return $ constructLink x l
Nothing -> do new <- linkDispatcher target''
case new of
Nothing -> writeLinkMetadata target'' ("", "", "", "", "") >> return x
Just (p,m) -> do
writeLinkMetadata target'' m
return $ constructLink x m
annotateLink _ x = return x
constructLink :: Inline -> MetadataItem -> Inline
constructLink x@(Link _ text (target, tooltip)) (title, author, date, doi, abstract) =
if abstract == "" then x else -- if no abstract, don't bother
Link ("", ["docMetadata"],
(filter (\d -> (snd d) /= "") [("popup-title",title), ("popup-author",author), ("popup-date",date), ("popup-doi",doi), ("popup-abstract",abstract)]))
text (target, tooltip)
constructLink a b = error $ "Error: a non-Link was passed into 'constructLink'! This should never happen." ++ show a ++ " " ++ show b
linkDispatcher, wikipedia, gwern, arxiv, biorxiv :: Path -> IO (Maybe (Path, MetadataItem))
linkDispatcher l | "https://en.wikipedia.org/wiki/" `isPrefixOf` l = wikipedia l
| "https://arxiv.org/abs/" `isPrefixOf` l = arxiv l
| "https://www.biorxiv.org/content/" `isPrefixOf` l = biorxiv l
| "https://www.shawwn.com/" `isPrefixOf` l = gwern (drop 22 l)
| head l == '/' = gwern (drop 1 l)
| otherwise = return Nothing
pdf :: Path -> IO (Maybe (Path, MetadataItem))
pdf p = do (_,_,mb) <- runShellCommand "./" Nothing "exiftool" ["-printFormat", "$Title$/$Author$/$Date$/$DOI", "-Title", "-Author", "-Date", "-DOI", p]
if B.length mb > 0 then
do let (etitle:eauthor:edate:edoi:_) = lines $ U.toString mb
print $ "PDF: " ++ p ++" DOI: " ++ edoi
aMaybe <- doi2Abstract edoi
-- if there is no abstract, there's no point in displaying title/author/date since that's already done by tooltip+URL:
case aMaybe of
Nothing -> return Nothing
Just a -> return $ Just (p, (trim etitle, trim eauthor, trim edate, edoi, a))
else return Nothing
-- nested JSON object: eg 'jq .message.abstract'
data Crossref = Crossref { message :: Message } deriving (Show,Generic)
instance FromJSON Crossref
data Message = Message { abstract :: Maybe String } deriving (Show,Generic)
instance FromJSON Message
doi2Abstract :: [Char] -> IO (Maybe String)
doi2Abstract doi = if length doi <7 then return Nothing
else do (_,_,bs) <- runShellCommand "./" Nothing "curl" ["--location", "--silent", "https://api.crossref.org/works/"++doi, "--user-agent", "gwern@gwern.net"]
if bs=="Resource not found." then return Nothing
else let j = eitherDecode bs :: Either String Crossref
in case j of -- start unwrapping...
Left e -> putStrLn ("Error: Crossref request failed: "++doi++" "++e) >> return Nothing
Right j' -> let j'' = abstract $ message j' in
case j'' of
Nothing -> return Nothing
Just a -> let trimmedAbstract = replace "" "" $ replace "Abstract\n\t " "" $ trim a
in return $ Just trimmedAbstract
data WP = WP { title :: !String, extract_html :: !String } deriving (Show,Generic)
instance FromJSON WP
wikipedia p
| "https://en.wikipedia.org/wiki/Special" `isPrefixOf` p = return Nothing
| "https://en.wikipedia.org/wiki/User:" `isPrefixOf` p = return Nothing
| "https://en.wikipedia.org/wiki/Talk:" `isPrefixOf` p = return Nothing
| "https://en.wikipedia.org/wiki/Category:" `isPrefixOf` p = return Nothing
| otherwise = do let p' = replace "/" "%2F" $ replace "%20" "_" $ drop 30 p
let p'' = [toUpper (head p')] ++ tail p'
let p''' = if '#' `elem` p'' then head $ split "#" p'' else p''
-- print p''
let rq = "https://en.wikipedia.org/api/rest_v1/page/summary/"++p'''++"?redirect=true"
-- `--location` is required or redirects will not be followed by *curl*; '?redirect=true' only makes the *API* follow redirects
(status,_,bs) <- runShellCommand "./" Nothing "curl" ["--location", "--silent", rq, "--user-agent", "gwern+wikipediascraping@gwern.net"]
case status of
ExitFailure _ -> putStrLn ("Wikipedia tooltip failed: " ++ p''') >> return Nothing
_ -> let j = eitherDecode bs :: Either String WP
in case j of
Left e -> putStrLn ("WP request failed: " ++ e ++ " " ++ p ++ " " ++ p''') >> return Nothing
Right wp -> let wp' = wp in
return $ Just (p, (title wp', "English Wikipedia", "", "", extract_html wp'))
biorxiv p = do (status,_,bs) <- runShellCommand "./" Nothing "curl" ["--location", "--silent", p, "--user-agent", "gwern+biorxivscraping@gwern.net"]
case status of
ExitFailure _ -> putStrLn ("Biorxiv download failed: " ++ p) >> return Nothing
_ -> do
let b = U.toString bs
let f = parseTags b
let metas = filter (isTagOpenName "meta") f
let title = concatMap (\x@(TagOpen _ (a:b)) -> if snd a == "DC.Title" then snd $ head b else "") metas
let date = concatMap (\x@(TagOpen _ (a:b)) -> if snd a == "DC.Date" then snd $ head b else "") metas
let author = intercalate ", " $ filter (/="") $ map (\x@(TagOpen _ (a:b)) -> if snd a == "DC.Contributor" then snd $ head b else "") metas
let doi = concatMap (\x@(TagOpen _ (a:b)) -> if snd a == "citation_doi" then snd $ head b else "") metas
let abstract = replace "ABSTRACT
" "" $ replace "Abstract
" "" $ replace "SUMMARY
" "" $
trim $ concatMap (\x@(TagOpen _ (a:b:c)) ->
if snd a == "citation_abstract" then snd $ head c else "") metas
return $ Just (p, (title, author, date, doi, abstract))
arxiv url = do -- Arxiv direct PDF links are deprecated but sometimes sneak through
let arxivid = if "/pdf/" `isInfixOf` url && ".pdf" `isSuffixOf` url
then replace "https://arxiv.org/pdf/" "" $ replace ".pdf" "" url
else replace "https://arxiv.org/abs/" "" url
(status,_,bs) <- runShellCommand "./" Nothing "curl" ["--location","--silent","https://export.arxiv.org/api/query?search_query=id:"++arxivid++"&start=0&max_results=1", "--user-agent", "gwern+arxivscraping@gwern.net"]
case status of
ExitFailure _ -> putStrLn ("Error: on Arxiv ID " ++ arxivid) >> return Nothing
_ -> do let tags = parseTags $ U.toString bs
let at = getTitle $ drop 8 tags
let aau = intercalate ", " $ getAuthorNames tags
let ad = take 10 $ getUpdated tags
let ado = getDoi tags
let aa = trim $ replace "\n" " " $ getSummary tags
return $ Just (url, (at, aau, ad, ado, aa))
trim :: String -> String
trim = reverse . dropWhile (isSpace) . reverse . dropWhile (isSpace) . filter (/='\n')
gwern p | ".pdf" `isSuffixOf` p = pdf p
| otherwise =
do (status,_,bs) <- runShellCommand "./" Nothing "curl" ["--location", "--silent", "https://www.shawwn.com/"++p, "--user-agent", "shawnpresser+gwernscraping@gmail.com"]
case status of
ExitFailure _ -> putStrLn ("Gwern.net download failed: " ++ p) >> return Nothing
_ -> do
let b = U.toString bs
let f = parseTags b
let metas = filter (isTagOpenName "meta") f
let title = concatMap (\x@(TagOpen _ (a:b)) -> if snd a == "title" then snd $ head b else "") metas
let date = concatMap (\x@(TagOpen _ (a:b)) -> if snd a == "dc.date.issued" then snd $ head b else "") metas
let author = concatMap (\x@(TagOpen _ (a:b)) -> if snd a == "author" then snd $ head b else "") metas
let doi = ""
let abstract = trim $ renderTags $ filter filterAbstract $ takeWhile takeToAbstract $ dropWhile dropToAbstract f
let description = concatMap (\x@(TagOpen _ (a:b)) -> if snd a == "description" then snd $ head b else "") metas
-- the description is inferior to the abstract, so we don't want to simply combine them, but if there's no abstract, settle for the description:
let abstract' = if length description > length abstract then description else abstract
return $ Just (p, (title, author, date, doi, abstract'))
where
dropToAbstract (TagOpen "div" [("id", "abstract")]) = False
dropToAbstract _ = True
takeToAbstract (TagClose "div") = False
takeToAbstract _ = True
filterAbstract (TagOpen "div" _) = False
filterAbstract (TagClose "div") = False
filterAbstract (TagOpen "blockquote" _) = False
filterAbstract (TagClose "blockquote") = False
filterAbstract _ = True