-- (notonline)
--
-- demo\rosetta\Count_examples.exw
-- ===============================
--
-- (This uses a few '&' instead of/as well as 'a', fairly obviously for everyone's sanity..)
-- Counts no of "{{he&der|" (nb not "=={{he&der|") via web api (but gets tasks via scraping).
-- Since downloading all the pages can be very slow, this uses a cache.
-- Limiting it to tasks not yet done in one language (eg notlang = "Phix") fairly obviously speeds it up tenfold :-)
--
without js -- (fairly obviously this will never ever run in a browser!)
-- Settings read by count_tasks():
-- include_drafts: when true, the "Draft_Programming_Tasks" category is scanned
--                 in addition to "Programming_Tasks".
constant include_drafts = true
-- sort_by_count: when true, the per-task lines are held back and printed once
--                all tasks have been scanned, highest count first.
constant sort_by_count = false
-- notlang: name of a language category whose tasks are to be skipped,
--          eg "Phix", or "" (ie a zero length string) to scan all tasks.
constant notlang = ""
include rosettacode_cache.e
function count_tasks()
--
-- Scans every task page and returns the total number of "{{he&der|" entries
-- found, reporting per-task counts via progress() along the way.
--
-- Controlled by the file-level constants:
--  include_drafts - also scan the "Draft_Programming_Tasks" category.
--  notlang        - when not "", tasks listed in that language's category are skipped.
--  sort_by_count  - when true, per-task lines are printed at the end, highest
--                   count first; otherwise they are printed as processed (all
--                   of them when notlang is set, else the first two, the last
--                   two and every 200th).
--
-- Pressing escape stops the scan early; the total then covers only the tasks
-- processed so far.
--
    if get_file_type("rc_cache")!=FILETYPE_DIRECTORY then
        if not create_directory("rc_cache") then
            crash("cannot create rc_cache directory")
        end if
    end if
    -- note this lot use web scraping (as cribbed from a similar task) ...
    sequence tasks = dewiki(open_category("Programming_Tasks"))
    if include_drafts then
        tasks &= dewiki(open_category("Draft_Programming_Tasks"))
        tasks = sort(tasks)
    end if
    if length(notlang) then
        -- filter already done in specified language
        string langurl = "http://rosettacode.org/wiki/Category:"&notlang
        sequence done = dewiki(open_download(notlang&".htm",langurl))
        integer k = 0
        for i=1 to length(tasks) do
            string ti = tasks[i]
            integer d = find(ti,done)
            if not d then
                -- not yet done: keep it, compacting tasks in place
                k += 1
                tasks[k] = ti
            else
                -- already done: drop it from done so later finds have less to search
                done[d..d] = {}
            end if
        end for
        tasks = tasks[1..k]
        done = {}
    end if
    progress("%d tasks found\n",{length(tasks)})
    -- ... whereas the individual tasks use the web api instead (3x smaller/faster)
    -- (the marker is built from two pieces so this source does not itself contain it)
    string hdr = `{{hea`&`der|`
    integer hdrlen = length(hdr)
    integer total_count = 0
    sequence task_counts = repeat(0,length(tasks))
    for i=1 to length(tasks) do
        string ti = tasks[i],
               url = sprintf("http://rosettacode.org/mw/index.php?title=%s&action=raw",{ti}),
               contents = open_download(ti&".raw",url),
               prev = "", curr
        integer count = 0, start = 1, finish
        while true do
            start = match(hdr,contents,start)
            if start=0 then exit end if
            --
            -- skip duplicates/we also have to cope with eg
            --  =={{he&der|Python}}==                                   \
            --  ==={{he&der|Python}} Original===                         } count
            --  ==={{he&der|Python}} Succinct===                         }  once
            --  ==={{he&der|Python}} Recursive ===                      /
            --  =={{he&der|Mathematica}} / {{he&der|Wolfram Language}}== \
            --  =={{he&der|Icon}} and {{he&der|Unicon}}==                 } count
            --  == {{he&der|Icon}} and {{he&der|Unicon}} ==              /   both
            --  == {{he&der|Java}}==
            -- etc. Note however that this /does/ count eg
            --  ==={{he&der|Applesoft BASIC}}===                        \
            --  ==={{he&der|BASIC256}}===                                } count
            --  ==={{he&der|Commodore BASIC}}===                         }  'em
            --  ==={{he&der|IS-BASIC}}===                                }  all
            --  ==={{he&der|Sinclair ZX81 BASIC}}===                    /
            --
            finish = match(`}}`,contents,start+1)
            -- an unterminated template would otherwise yield an invalid
            -- slice (start..0) and abort the whole run; ignore it instead
            if finish=0 then exit end if
            curr = contents[start..finish]
            if curr!=prev then
                count += 1
            end if
            prev = curr
            start += hdrlen
        end while
        if sort_by_count then
            task_counts[i] = count
        elsif length(notlang) or i<=2 or i>=length(tasks)-1 or mod(i,200)=0 then
            progress("%s: %d\n",{html_clean(ti),count})
        end if
        total_count += count
        if get_key()=#1B then progress("escape keyed\n") exit end if
    end for
    curl_cleanup()
    if sort_by_count then
        -- tag sort: tags is ordered by ascending count, so walk it backwards
        sequence tags = custom_sort(task_counts,tagset(length(tasks)))
        for i=length(tags) to 1 by -1 do
            integer ti = tags[i]
            progress("%s: %d\n",{html_clean(tasks[ti]),task_counts[ti]})
        end for
    end if
    return total_count
end function
-- Run the scan, then report the total across all tasks processed.
integer grand_total = count_tasks()
progress("Total: %d\n",{grand_total})