--[[--< Module:Webarchive >---

Lua module implementing the {{webarchive}} template.

A merger of the functionality of three templates: {{wayback}}, {{webcite}} and {{memento}}.

]]

require('Module:No globals');													-- error on any accidental global read/write

local getArgs = require ('Module:Arguments').getArgs;							-- standard argument fetcher/trimmer
local this_page = mw.title.getCurrentTitle();									-- title object for the current page; must be called, not stored as a function

----< F O R W A R D  D E C L A R A T I O N S >--

local categories = {};															-- category names from ./data
local err_warn_msgs = {};														-- error and warning messages from ./data
local excepted_pages = {};														-- pages exempt from the no-categorization rules, from ./data
local prefixes = {};															-- service provider tail string prefixes from ./data
local services = {};															-- archive service provider data from ./data
local uncategorized_namespaces = {};											-- list of namespaces that we should not categorize
local uncategorized_subpages = {};												-- list of subpages that should not be categorized

local ulx = {};																	-- associative array holding per-url template data (url1, url2, ...)
local track = {};																-- associative array holding tracking categories (keys are category names)

--[[--< G L O B A L  C O N F I G U R A T I O N   S E T T I N G S >

]]

local maxurls = 10;																-- max number of URLs allowed
local tname = 'Webarchive'														-- name of calling template; change if template renamed
local verifydates = 'yes'														-- see documentation; set "no" to disable date verification

--[[--< inlineError >---

Critical error. Render output completely in red. Add to tracking category.

]]

--[[
Critical error: record the error tracking category and render an error message
in the standard citation-error style.  'arg' is the template parameter at fault
(e.g. "url"); 'msg' is the detail text.
NOTE(review): span/code markup reconstructed; the extraction stripped the original
HTML — confirm against the live template's rendered output.
]]
local function inlineError(arg, msg)
	track[categories.error] = 1
	return '<span style="font-size:100%" class="error citation-comment">Error in webarchive template: Check <code style="color:inherit; border:inherit; padding:inherit;">&#124;' .. arg .. '=</code> value. ' .. msg .. '</span>'
end

--[[--< inlineRed >---

Render a text fragment in red, such as a warning as part of the final output. Add tracking category.

]]

--[[
Render a text fragment in red, such as a warning, as part of the final output.
'trackmsg' selects which tracking category to record ("warning" or "error");
any other value records nothing.
NOTE(review): span markup reconstructed; the extraction stripped the original HTML.
]]
local function inlineRed(msg, trackmsg)
	if trackmsg == "warning" then
		track[categories.warning] = 1;
	elseif trackmsg == "error" then
		track[categories.error] = 1;
	end

	return '<span style="color:red"> ' .. msg .. ' </span>'
end

--[[--< base62 >---

Convert base-62 to base-10
Credit: https://de.wikipedia.org/wiki/Modul:Expr

]]

-- Convert a base-62 string (digits 0-9 = 0-9, A-Z = 10-35, a-z = 36-61) to a
-- base-10 number.  Returns 1 when the input contains any character outside
-- [0-9A-Za-z] (default for malformed input).
local function base62( value )
	local r = 1																	-- default return value when input is malformed

	if value:match( "^%w+$" ) then												-- value must only contain [0-9a-zA-Z]
		local n = #value														-- number of characters in value
		local k = 1																-- place value of current digit
		local c
		r = 0
		for i = n, 1, -1 do														-- loop from least- to most-significant digit
			c = value:byte( i, i )
			if c >= 48 and c <= 57 then											-- character is digit 0-9
				c = c - 48
			elseif c >= 65 and c <= 90 then										-- character is ascii A-Z
				c = c - 55														-- maps to 10-35
			else																-- must be ascii a-z
				c = c - 61														-- maps to 36-61
			end
			r = r + c * k														-- accumulate this base62 character's value
			k = k * 62															-- bump place value for next digit
		end
	end
	return r
end

--[[--< tableLength >---

Given a 1-D table, return number of elements

]]

-- Count all key/value pairs in T.  Works for arbitrary (non-sequence) tables,
-- unlike the # operator which only measures the array part.
local function tableLength(T)
	local n = 0
	local k = next(T)
	while k ~= nil do
		n = n + 1
		k = next(T, k)
	end
	return n
end

--[[--< dateFormat >---

Given a date string, return its format: dmy, mdy, iso, ymd If unable to determine return nil

]]

-- Given a date string, return its format name: 'dmy', 'mdy', 'iso', or 'ymd'.
-- Returns a falsy value when the format cannot be determined or the captured
-- year is outside the interval (1900, 2200).
local function dateFormat(date)
	local patterns = {
		['iso'] = '(%d%d%d%d)%-%d%d%-%d%d',
		['dmy'] = '%d%d? +%a+ +(%d%d%d%d)',
		['mdy'] = '%a+ %d%d?, +(%d%d%d%d)',
		['ymd'] = '(%d%d%d%d) +%a+ %d%d?',										-- TODO: not mos compliant; delete?
		};

	local form, y;

	for k, v in pairs (patterns) do												-- loop through the patterns table
		y = mw.ustring.match (date, v);											-- looking for a match; capture is the year
		if y then																-- not nil when found
			form = k;															-- save the matching format name
			break;																-- and done
		end
	end

	return (y and (1900 < tonumber(y) and 2200 > tonumber(y))) and form;		-- TODO: why 1900? shouldn't that be birth-of-internet year? why 2200? shouldn't that be current year?
end

--[[--< makeDate >---

Given a zero-padded 4-digit year, 2-digit month and 2-digit day, return a full date in df format df = mdy, dmy, iso, ymd

on entry, year, month, day are presumed to be correct for the date that they represent; all are required

]]

-- Given zero-padded 4-digit year, 2-digit month and 2-digit day strings,
-- return a full date rendered in df format (df one of: mdy, dmy, iso, ymd).
-- Returns nil when any part is missing/empty or df is not a known format.
-- On entry, year, month, day are presumed correct for the date they represent.
local function makeDate(year, month, day, df)
	local format = {
		['dmy'] = 'j F Y',
		['mdy'] = 'F j, Y',
		['ymd'] = 'Y F j',
		['iso'] = 'Y-m-d',
		};

	if not year or '' == year or not month or '' == month or not day or '' == day or not format[df] then
		return nil;																-- something missing or unknown format
	end

	local date = table.concat ({year, month, day}, '-');						-- assemble iso format date
	return mw.getContentLanguage():formatDate (format[df], date);
end

--[[--< I S _ V A L I D _ D A T E >

Returns true if date is after 31 December 1899 (why is 1900 the min year? shouldn't the internet's date-of-birth be min year?), not after today's date, and represents a valid date (29 February 2017 is not a valid date). Applies Gregorian leapyear rules.

all arguments are required

]]

-- Returns true if the (year, month, day) strings form a real calendar date
-- (Gregorian leap-year rules) that is after 31 December 1899 and not after
-- the current date/time; false otherwise.  All arguments are required.
-- (Why 1900 as the minimum? TODO: shouldn't the internet's date-of-birth be the min year?)
local function is_valid_date (year, month, day)
	local days_in_month = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
	local month_length;
	local y, m, d;
	local today = os.date ('*t');												-- fetch a table of current date parts

	if not year or '' == year or not month or '' == month or not day or '' == day then
		return false;															-- something missing
	end
	y = tonumber (year);
	m = tonumber (month);
	d = tonumber (day);

	if 1900 > y or today.year < y or 1 > m or 12 < m then						-- year and month are within bounds	TODO: 1900?
		return false;
	end

	if (2==m) then																-- if February
		month_length = 28;														-- then 28 days unless
		if (0==(y%4) and (0~=(y%100) or 0==(y%400))) then						-- it is a leap year
			month_length = 29;													-- then 29 days in February
		end
	else
		month_length=days_in_month[m];
	end

	if 1 > d or month_length < d then											-- day is within bounds
		return false;
	end
																				-- here when date parts represent a valid date
	return os.time({['year']=y, ['month']=m, ['day']=d, ['hour']=0}) <= os.time();	-- date at midnight must be less than or equal to current date/time
end

--[[--< decodeWebciteDate >---

Given a URI-path to Webcite (eg. /67xHmVFWP) return the encoded date in df format

]]

-- Given a URI-path to WebCite (e.g. /67xHmVFWP), return the encoded snapshot
-- date rendered in df format.  Returns the string "query" for the known
-- WebCite URL forms that do not carry a base-62 encoded timestamp.
local function decodeWebciteDate(path, df)
	local parts = mw.text.split(path, "/")

	-- valid URL formats that are not base62:
	--   http://www.webcitation.org/query?id=1138911916587475
	--   http://www.webcitation.org/query?url=http..&date=2012-06-01+21:40:03
	--   http://www.webcitation.org/1138911916587475
	--   http://www.webcitation.org/cache/73e53dd1f16cf8c5da298418d2a6e452870cf50e
	--   http://www.webcitation.org/getfile.php?fileid=1c46e791d68e89e12d0c2532cc3cf629b8bc8c8e
	if mw.ustring.find( parts[2], "query", 1, true) or
			mw.ustring.find( parts[2], "cache", 1, true) or
			mw.ustring.find( parts[2], "getfile", 1, true) or
			tonumber(parts[2]) then
		return "query"
	end

	-- base62 string -> number -> text -> first 10 characters -> a table of date parts
	local dt = os.date('*t', string.format("%d", base62(parts[2])):sub(1,10))

	if not is_valid_date (dt.year, dt.month, dt.day) then
		return inlineRed(err_warn_msgs.date_err, 'error');
	end
	return makeDate(dt.year, dt.month, dt.day, df) or inlineRed (err_warn_msgs.date4, 'error');
end

--[[--< decodeWaybackDate >---

Given a URI-path to Wayback (eg. /web/20160901010101/http://example.com ) or Library of Congress Web Archives (/all/20160901010101/http://example.com) return the formatted date eg. "September 1, 2016" in df format Handle non-digits in snapshot ID such as "re_" and "-" and "*"

]]

-- Given a URI-path to Wayback (e.g. /web/20160901010101/http://example.com) or
-- Library of Congress Web Archives (/all/20160901010101/http://example.com),
-- return the snapshot date rendered in df format (e.g. "September 1, 2016").
-- Returns "index" for calendar-view paths (/web/*/http..).  Handles non-digit
-- decoration in the snapshot ID such as "re_", "-" and a trailing "*".
local function decodeWaybackDate(path, df)
	local msg, snapdate;

	snapdate = path:gsub ('^/all/', ''):gsub ('^/web/', ''):gsub ('^/', '');	-- remove leading '/all/', leading '/web/' or leading '/'
	snapdate = snapdate:match ('^[^/]+');										-- get timestamp
	if snapdate == "*" then														-- e.g. /web/*/http.. or /all/*/http..
		return "index"
	end

	snapdate = snapdate:gsub ('%a%a_%d?$', ''):gsub ('%-', '');					-- from date, remove any trailing "re_", dashes

	msg = '';
	if snapdate:match ('%*$') then												-- a trailing '*' causes calendar display at archive.org
		snapdate = snapdate:gsub ('%*$', '');									-- remove so not part of length calc later
--		msg = inlineRed(err_warn_msgs.ts_cal, 'warning');						-- TODO: enable this warning message
	end

	if not tonumber(snapdate) then
		return inlineRed (err_warn_msgs.date2, 'error');
	end
	local dlen = string.len(snapdate)
	if dlen < 8 then															-- we need 8 digits TODO: but shouldn't this be testing for 14 digits?
		return inlineRed (err_warn_msgs.date3, 'error');
	end

	local year, month, day = snapdate:match ('(%d%d%d%d)(%d%d)(%d%d)');			-- no need for snapdatelong here

	if not is_valid_date (year, month, day) then
		return inlineRed(err_warn_msgs.date_err, 'error');
	end

	return makeDate(year, month, day, df) or inlineRed (err_warn_msgs.date7, 'error');

--	TODO: enable this chunk (and delete the return above) to warn on short timestamps:
--	snapdate = makeDate(year, month, day, df);
--	if snapdate then
--		if 14 == dlen then
--			return snapdate, msg;												-- return date with message if any
--		else
--			return snapdate, msg .. inlineRed(err_warn_msgs.ts_len, 'warning');	-- return date with warning message(s)
--		end
--	else
--		return inlineRed (err_warn_msgs.date7, 'error');						-- return error message
--	end
end

--[[--< decodeArchiveisDate >---

Given an Archive.is "long link" URI-path (e.g. /2016.08.28-144552/http://example.com) return the date in df format (e.g. if df = dmy, return 28 August 2016) Handles "." and "-" in snapshot date, so 2016.08.28-144552 is same as 20160828144552

]]

-- Given an Archive.is "long link" URI-path (e.g. /2016.08.28-144552/http://example.com),
-- return the snapshot date rendered in df format (e.g. 28 August 2016 for df = dmy).
-- Returns "short link" for short-form paths (e.g. /hD1qz).  Handles "." and "-"
-- in the snapshot date, so 2016.08.28-144552 equals 20160828144552.
-- When the timestamp is valid but shorter than 14 digits, returns the date plus
-- a warning message as a second value.
local function decodeArchiveisDate(path, df)
	local snapdate

	if path:match ('^/%w+$') then												-- short form url path is '/' followed by base62 digits and nothing else
		return "short link"														-- e.g. http://archive.is/hD1qz
	end

	snapdate = mw.text.split (path, '/')[2]:gsub('[%.%-]', '');					-- get snapshot date, e.g. 2016.08.28-144552; remove periods and hyphens

	local dlen = string.len(snapdate)
	if dlen < 8 then															-- we need 8 digits TODO: but shouldn't this be testing for 14 digits?
		return inlineRed (err_warn_msgs.date3, 'error');
	end

	local year, month, day = snapdate:match ('(%d%d%d%d)(%d%d)(%d%d)');			-- no need for snapdatelong here

	if not is_valid_date (year, month, day) then
		return inlineRed(err_warn_msgs.date_err, 'error');
	end

	snapdate = makeDate(year, month, day, df);
	if snapdate then
		if 14 == dlen then
			return snapdate;													-- return date
		else
			return snapdate, inlineRed(err_warn_msgs.ts_len, 'warning');		-- return date with warning message
		end
	else
		return inlineRed (err_warn_msgs.date7, 'error');						-- return error message
	end
end

--[=[-< M A K E _ W I K I L I N K >

Makes a wikilink; when both link and display text is provided, returns a wikilink in the form L|D; if only link is provided, returns a wikilink in the form L; if neither are provided or link is omitted, returns an empty string.

]=]

-- Makes a wikilink.  When both link and display are provided, returns a
-- wikilink in the form [[L|D]]; with only link, returns [[L]]; when link is
-- omitted/empty, returns the display text (or '').  When no_link is set
-- (non-nil), returns unlinked text: the display text if present, else the
-- link target name, else ''.
local function make_wikilink (link, display, no_link)
	if nil == no_link then
		if link and ('' ~= link) then
			if display and ('' ~= display) then
				return table.concat ({'[[', link, '|', display, ']]'});
			else
				return table.concat ({'[[', link, ']]'});
			end
		end
		return display or '';													-- link not set so return the display text

	else																		-- no_link
		if display and ('' ~= display) then										-- if there is display text
			return display;														-- return that
		else
			return link or '';													-- return the target article name or empty string
		end
	end
end

--[[--< serviceName >---

Given a domain extracted by mw.uri.new (eg. web.archive.org) set tail string and service ID

]]

-- Given a domain extracted by mw.uri.new (e.g. web.archive.org), set
-- ulx.url1.service (service ID) and ulx.url1.tail (tail display string), and
-- record the appropriate tracking category in track[].
local function serviceName(host, no_link)
	local tracking;
	local index;

	host = host:lower():gsub ('^web%.(.+)', '%1'):gsub ('^www%.(.+)', '%1');	-- lowercase, remove web. and www. subdomains

	if services[host] then
		index = host;
	else
		for k, _ in pairs (services) do											-- not an exact match; try to find host within known domains
			if host:find ('%f[%a]'..k:gsub ('([%.%-])', '%%%1')) then			-- escape pattern magic in the key; %f frontier anchors at a word start
				index = k;
				break;
			end
		end
	end

	if index then
		local out = {''};														-- empty string in [1] so that concatenated result has leading single space
		ulx.url1.service = services[index][4] or 'other';
		tracking = services[index][5] or categories.other;

		-- build tail string
		if false == services[index][1] then										-- select prefix
			table.insert (out, prefixes.at);
		elseif true == services[index][1] then
			table.insert (out, prefixes.atthe);
		else
			table.insert (out, services[index][1]);
		end

		table.insert (out, make_wikilink (services[index][2], services[index][3], no_link));	-- add article wikilink
		if services[index][6] then												-- add tail postfix if it exists
			table.insert (out, services[index][6]);
		end
		ulx.url1.tail = table.concat (out, ' ');								-- put it all together; result has leading space character

	else																		-- here when unknown archive
		ulx.url1.service = 'other';
		tracking = categories.unknown;
		ulx.url1.tail = table.concat ({'', prefixes.at, host, inlineRed (err_warn_msgs.unknown_url)}, ' ');	-- TODO: call to inlineRed does not specify 'error' or 'warning'; should it?
	end

	track[tracking] = 1
end

--[[--< parseExtraArgs >---

Parse numbered arguments starting at 2, such as url2..url10, date2..date10, title2..title10 For example: Three url arguments not in numeric sequence (1..4..7). Function only processes arguments numbered 2 or greater (in this case 4 and 7) It creates numeric sequenced table entries like: urlx.url2.url = urlx.url3.url = Returns the number of URL arguments found numbered 2 or greater (in this case returns "2")

]]

-- Parse numbered arguments starting at 2: url2..url10, date2..date10,
-- title2..title10.  URL arguments out of numeric sequence (e.g. 1..4..7) are
-- compacted into sequential ulx entries (url2, url3, ...).  Returns the number
-- of URL arguments found that are numbered 2 or greater (2 in that example).
local function parseExtraArgs(args)
	local j = 2																	-- next compacted ulx slot

	for i = 2, maxurls do
		local argurl = "url" .. i
		if args[argurl] then
			local argurl2 = "url" .. j
			ulx[argurl2] = {}
			ulx[argurl2]["url"] = args[argurl]

			local argdate = "date" .. j											-- NOTE(review): date/title are fetched by compacted index j, not source index i — confirm this pairing is intended
			if args[argdate] then
				ulx[argurl2]["date"] = args[argdate]
			else
				ulx[argurl2]["date"] = inlineRed (err_warn_msgs.date_miss, 'warning');
			end

			local argtitle = "title" .. j
			if args[argtitle] then
				ulx[argurl2]["title"] = args[argtitle]
			else
				ulx[argurl2]["title"] = nil
			end
			j = j + 1
		end
	end

	if j == 2 then
		return 0
	else
		return j - 2
	end
end

--[[--< comma >---

Given a date string, return "," if it's MDY

]]

-- Return ',' when date is in MDY form ("September 1, 2016"), else ''.
-- Fixed: previously the function returned the matched substring itself rather
-- than a comma, and the pattern lacked the ',' that distinguishes MDY from DMY.
local function comma(date)
	return (date and date:match ('%a+ +%d%d?, +%d%d%d%d')) and ',' or '';
end

--[[--< createTracking >---

Return data in track[] ie. tracking categories

]]

-- Return the accumulated tracking categories in track[] as a concatenated
-- string of category wikilinks; returns '' when the current page is in a
-- namespace or subpage class that should not be categorized (unless the page
-- is explicitly excepted, e.g. this module's testcases pages).
local function createTracking()
	if not excepted_pages[this_page.fullText] then								-- namespace:title/fragment may be allowed to be categorized (typically testcases pages)
		if uncategorized_namespaces[this_page.nsText] then
			return '';															-- this page not to be categorized so return empty string
		end
		for _,v in ipairs (uncategorized_subpages) do							-- cycle through page name patterns
			if this_page.text:match (v) then									-- test page name against each pattern
				return '';														-- this subpage type not to be categorized so return empty string
			end
		end
	end

	local out = {};
	if tableLength(track) > 0 then
		for key, _ in pairs(track) do											-- loop through table
			table.insert (out, make_wikilink (key));							-- and convert category names to links
		end
	end
	return table.concat (out);													-- concat into one big string; empty string if table is empty
end

--[[--< createRendering >---

Return a rendering of the data in ulx[][]

TODO: when archive date is '*' ('index') leading archive extlink should be [ Archive index] instead of [ Archived] index; code to support this has been added but is commented out for the time being; look for TODO1

]]

--[[
Return a rendering of the data in ulx[][], or nil on an unexpected format.

TODO1: when archive date is '*' ('index') the leading archive extlink should be
[ Archive index] instead of [ Archived] index; supporting code is present but
commented out; look for TODO1.
]]
local function createRendering()
	local displayfield
	local out = {};
	local period1 = '';															-- for backwards compatibility
	local period2 = '.';

	if 'none' == ulx.url1.format then											-- single-archive rendering (wayback/webcite style)
		table.insert (out, '[');												-- open extlink markup
		table.insert (out, ulx.url1.url);										-- add url

		if ulx.url1.title then
			table.insert (out, ' ')												-- the required space
			table.insert (out, ulx.url1.title)									-- the title
			table.insert (out, ']');											-- close extlink markup
			table.insert (out, ulx.url1.tail);									-- tail text
			if ulx.url1.date then
				table.insert (out, '&#32;(');									-- open date text; TODO: why the html entity?
				table.insert (out, 'index' == ulx.url1.date and 'archive' or 'archived');	-- add text
				table.insert (out, ' ');										-- insert a space
				table.insert (out, ulx.url1.date);								-- add date
				table.insert (out, ')');										-- close date text
			end
		else																	-- no title
			table.insert (out, ' Archived]')									-- close extlink markup; TODO1: remove this line
--TODO1		table.insert (out, 'index' == ulx.url1.date and ' Archive index]' or ' Archived]');	-- begin link label; use for correct label when date is 'index'
			if ulx.url1.date then
				if 'wayback' == ulx.url1.service then
					period1 = '.';
					period2 = '';
				end
				table.insert (out, table.concat ({' ', ulx.url1.date}));		-- add date; TODO1: remove this line
--TODO1			if 'index' ~= ulx.url1.date then								-- TODO1: add this chunk for correct label when date is 'index'
--TODO1				table.insert (out, ulx.url1.date);
--TODO1			end
				table.insert (out, comma(ulx.url1.date));						-- add ',' if date format is mdy
				table.insert (out, ulx.url1.tail);								-- add tail text
				table.insert (out, period1);									-- terminate
			else																-- no date
				table.insert (out, ulx.url1.tail);								-- add tail text
			end
		end

		if 0 < ulx.url1.extraurls then											-- for multiple archive URLs
			local tot = ulx.url1.extraurls + 1
			table.insert (out, period2);										-- terminate first url
			table.insert (out, ' Additional archives: ');						-- add header text

			for i=2, tot do														-- loop through the additionals
				local index = table.concat ({'url', i});						-- make an index
				displayfield = ulx[index]['title'] and 'title' or 'date';		-- choose display text
				table.insert (out, '[');										-- open extlink markup
				table.insert (out, ulx[index]['url']);							-- add the url
				table.insert (out, ' ');										-- the required space
				table.insert (out, ulx[index][displayfield]);					-- add the label
				table.insert (out, ']');										-- close extlink markup
				table.insert (out, i==tot and '.' or ', ');						-- add terminator
			end
		end
		return table.concat (out);												-- make a big string and done

	else																		-- multiple-archive rendering
		if 'addlarchives' == ulx.url1.format then								-- multiple archive services
			table.insert (out, 'Additional archives: ');						-- add header text
		else																	-- multiple pages from the same archive
			table.insert (out, 'Additional pages archived on ');				-- add header text
			table.insert (out, ulx.url1.date);									-- add date to header text
			table.insert (out, ': ');											-- close header text
		end

		local tot = ulx.url1.extraurls + 1;
		for i=1, tot do															-- loop through the additionals
			local index = table.concat ({'url', i});							-- make an index
			table.insert (out, '[');											-- open extlink markup
			table.insert (out, ulx[index]['url']);								-- add url
			table.insert (out, ' ');											-- add required space

			displayfield = ulx[index]['title'];
			if 'addlarchives' == ulx.url1.format then
				if not displayfield then
					displayfield = ulx[index]['date']
				end
			else																-- must be addlpages
				if not displayfield then
					displayfield = table.concat ({'Page ', i});
				end
			end
			table.insert (out, displayfield);									-- add title, date, page label text
			table.insert (out, ']');											-- close extlink markup
			table.insert (out, (i==tot and '.' or ', '));						-- add terminator
		end
		return table.concat (out);												-- make a big string and done
	end
end

--[[--< W E B A R C H I V E >--

template entry point

TODO: deprecate empty |nolink= as a 'positive' assertion that archive service is not to be linked

]]

--[[
Template entry point.  Fetches and validates |url=, |date=, |title= (and their
numbered variants), decodes the snapshot date from the archive URL when
possible, and returns the rendered output plus tracking categories.

TODO: deprecate empty |nolink= as a 'positive' assertion that archive service is not to be linked
]]
local function webarchive(frame)
	local args = getArgs (frame, {												-- TODO: delete this assignment ...
		valueFunc = function (key, value)										-- ... this code exists so that we can detect and handle the oddity that is |nolink=
			if 'nolink' == key then												-- |nolink= is 'set' when present with or without assigned value; TODO: deprecate this peculiar use
				return value;													-- don't trim; we don't care (right now) what the value is except when nil, and we can't trim nil
			elseif value then													-- all other values: if the value is not nil
				value = mw.text.trim (value);									-- trim whitespace
				if '' ~= value then												-- empty string when value was only whitespace or was empty
					return value;												-- return non-nil, non-empty values
				end
			end
			return nil;															-- value was nil, empty, or contained only whitespace
		end																		-- end of valueFunc
		});

--	local args = getArgs (frame);												-- TODO: replace the above with this

	local data = mw.loadData (table.concat ({									-- make a data module name; sandbox or live
		'Module:Webarchive/data',
		frame:getTitle():find('sandbox', 1, true) and '/sandbox' or ''			-- if this instance is ./sandbox then append /sandbox
		}));
	categories = data.categories;												-- fill in the forward declarations
	err_warn_msgs = data.err_warn_msgs;
	excepted_pages = data.excepted_pages;
	prefixes = data.prefixes;
	services = data.services;
	uncategorized_namespaces = data.uncategorized_namespaces;
	uncategorized_subpages = data.uncategorized_subpages;

	local date, format, msg, uri, url;
	verifydates = 'yes' == verifydates;											-- convert to boolean

	if args.url and args.url1 then												-- URL argument (first)
		return inlineError("url", "Conflicting |url= and |url1=.") .. createTracking();
	end
	url = args.url or args.url1;
	if not url then
		return inlineError("url", "Empty.") .. createTracking()
	end
	if mw.ustring.find( url, "https://web.http", 1, true ) then					-- track bug - TODO: IAbot bug; not known if the bug has been fixed; deferred
		track[categories.error] = 1;
		return inlineError("url", "https://web.http") .. createTracking()
	end
	if url == "https://web.archive.org/http:/" then								-- track bug - TODO: IAbot bug; not known if the bug has been fixed; deferred
		track[categories.error] = 1;
		return inlineError("url", "Invalid URL") .. createTracking()
	end

	ulx.url1 = {}
	ulx.url1.url = url

	if not (url:lower():find ('^http') or url:find ('^//')) then				-- TODO: is this a good idea? isn't it better to simply throw an error when url is malformed ...
		ulx.url1.url = 'http://' .. url											-- ... rather than apply this 'fix' that might not fix anything?
	end

	ulx.url1.extraurls = parseExtraArgs(args)

	uri = mw.uri.new (ulx.url1.url);											-- get a table of uri parts from this url
	serviceName(uri.host, args.nolink)

	if args.date and args.date1 then											-- Date argument
		return inlineError("date", "Conflicting |date= and |date1=.") .. createTracking();
	end
	date = args.date or args.date1

	if 'wayback' == ulx.url1.service or 'locwebarchives' == ulx.url1.service then
		if '*' == date then														-- TODO: why is this not compared to url date?
			date = 'index';
		end
		if date then
			if verifydates then
				local ldf = dateFormat(date)
				if ldf then
					local udate, msg = decodeWaybackDate( uri.path, ldf )		-- get the url date in the same format as date in |date=; 'index' when wayback date is *
					if udate ~= date then
						date = udate .. inlineRed (err_warn_msgs.mismatch, 'warning') .. (msg or '');	-- mismatch: use url date; add message if there is one
					else
						date = date .. (msg or '');								-- add message if there is one
					end
				end
			end
		else																	-- no |date=
			date, msg = decodeWaybackDate( uri.path, "iso" )
			if not date then
				date = inlineRed (err_warn_msgs.date1, 'error');				-- TODO: change this type of message so that it identifies url as source of error?
			else
				date = date .. (msg or '');										-- add message if there is one
			end
		end

	elseif 'webcite' == ulx.url1.service then
		if date then
			if verifydates then
				local ldf = dateFormat(date)
				if ldf then
					local udate = decodeWebciteDate( uri.path, ldf )			-- get the url date in the same format as date in |date=
					if 'query' ~= udate then									-- skip if query
						if udate ~= date then
							date = udate .. inlineRed (err_warn_msgs.mismatch, 'warning');
						end
					end
				end
			end
		else
			date = decodeWebciteDate( uri.path, "iso" )
			if date == "query" then
				date = inlineRed (err_warn_msgs.date_miss, 'warning');
			elseif not date then
				date = inlineRed (err_warn_msgs.date1, 'error');
			end
		end

	elseif 'archiveis' == ulx.url1.service then
		if date then
			if verifydates then
				local ldf = dateFormat(date)
				if ldf then
					local udate, msg = decodeArchiveisDate( uri.path, ldf )		-- get the url date in the same format as date in |date=
					if 'short link' ~= udate then								-- skip if short link
						if udate ~= date then
							date = udate .. inlineRed (err_warn_msgs.mismatch, 'warning') .. (msg or '');	-- mismatch: use url date; add message if there is one
						else
							date = date .. (msg or '');							-- add message if there is one
						end
					end
				end
			end
		else																	-- no |date=
			date, msg = decodeArchiveisDate( uri.path, "iso" )
			if date == "short link" then
				date = inlineRed (err_warn_msgs.date_miss, 'warning');
			elseif not date then
				date = inlineRed (err_warn_msgs.date1, 'error');
			else
				date = date .. (msg or '');										-- add message if there is one
			end
		end

	else																		-- some other service
		if not date then
			date = inlineRed (err_warn_msgs.date_miss, 'warning');
		end
	end

	ulx.url1.date = date

	format = args.format;														-- Format argument

	if not format then
		format = "none"
	else
		if format == "addlpages" then
			if not ulx.url1.date then											-- addlpages needs a date for its header
				format = "none"
			end
		elseif format == "addlarchives" then
			format = "addlarchives"
		else
			format = "none"
		end
	end
	ulx.url1.format = format

	if args.title and args.title1 then											-- Title argument
		return inlineError("title", "Conflicting |title= and |title1=.") .. createTracking();
	end
	ulx.url1.title = args.title or args.title1;

	local rend = createRendering()
	if not rend then
		-- NOTE(review): span markup reconstructed; the extraction stripped the original HTML
		rend = '<span style="font-size:100%" class="error citation-comment">Error in [[:Template:' .. tname .. ']]: Unknown problem. Please report on template talk page.</span>';
		track[categories.error] = 1;
	end

	return rend .. createTracking()
end

----< E X P O R T E D 	 F U N C T I O N S >--

-- export the template entry point; invoked as {{#invoke:Webarchive|webarchive}}
return {webarchive = webarchive};