Pure client Browser approach:
OK, I made this fiddle for you; it may help others too. I found this interesting and challenging. Below are the points describing how I arrived at a possible solution:
- Used the Blob API (part of the browser File API, not ECMAScript itself) to create a text file on the fly.
- Loaded http://www.bbc.co.uk/news in iframe (Cross Domain origin access - See Note section below)
- On the iframe's load event, trigger a delayed call using either setTimeout or
setInterval (commented out; use it for repeated execution, e.g. hourly or daily). Adjust the delay as per your need.
- Querying the text nodes using document.querySelectorAll(".title-link span") seemed
to be generic based on examining the webpage source.
- Check out the fiddle link
Javascript:
/**
 * Collects the text of every ".title-link span" element found inside the
 * #frame iframe and offers it to the user as a downloadable "sample.txt".
 *
 * The extraction runs once, one second after the iframe's load event, and
 * again every time the #create button is clicked.
 *
 * Reading the iframe's document only works when the browser allows access to
 * it (same origin, or web security disabled); otherwise the property access
 * throws.
 */
(function () {
    // Object URL of the most recently generated file, or null if none yet.
    var textFile = null;

    /**
     * Wraps `text` in a plain-text Blob and returns an object URL for it.
     *
     * @param {string} text - Content of the file to generate.
     * @returns {string} Object URL pointing at the generated Blob.
     */
    var makeTextFile = function (text) {
        var data = new Blob([text], {
            type: 'text/plain'
        });
        // If we are replacing a previously generated file we need to
        // manually revoke the object URL to avoid memory leaks.
        if (textFile !== null) {
            window.URL.revokeObjectURL(textFile);
        }
        textFile = window.URL.createObjectURL(data);
        return textFile;
    };

    var iframe = document.getElementById('frame');
    // Look the button up explicitly instead of relying on the implicit
    // global that browsers create for elements with an id.
    var createButton = document.getElementById('create');

    /**
     * Extracts the headline text from the iframe and triggers the download.
     */
    var commFunc = function () {
        // Re-query the iframe on every call so the freshly updated DOM is read.
        var frame = document.getElementById('frame');
        var innerDoc = frame.contentDocument || frame.contentWindow.document;
        var spans = Array.prototype.slice.call(innerDoc.querySelectorAll(".title-link span"));

        // Each entry is prefixed with a newline. Iterating with map() visits
        // only the array elements, in index order, unlike for...in.
        var text = spans.map(function (span) {
            return "\n" + span.innerText;
        }).join("");

        var link = document.createElement("a");
        link.href = makeTextFile(text);
        link.download = "sample.txt";
        // Some browsers ignore click() on an anchor that is not attached to
        // the document, so attach it for the duration of the click.
        document.body.appendChild(link);
        link.click();
        document.body.removeChild(link);
        console.log("Downloaded the sample.txt file");
    };

    iframe.onload = function () {
        setTimeout(commFunc, 1000); // Adjust the time required to load
        // For periodic grabbing use setInterval(commFunc, <interval>) instead.
    };

    // Also allow a manual download once the page inside the iframe has loaded.
    createButton.addEventListener('click', commFunc);
})();
HTML:
<!-- NOTE(review): this span sits in the host page, not inside the iframe, so the
     script's query against the iframe document never reads it. It looks like a
     sample of the headline markup being targeted; confirm whether it is needed. -->
<span class="title-link__title-text">Benefit plan 'could hit young Britons'</span>
<div>
<!-- Frame whose document the script searches for ".title-link span" elements. -->
<iframe id="frame" src="http://www.bbc.co.uk/news"></iframe>
</div>
<!-- Clicking this button runs the same extract-and-download routine on demand. -->
<button id="create">Download</button>
Note:
- To run the above JavaScript in Chrome you need to disable web security.
The above script should run fine in Firefox with no tweaks needed.
- This is a possible illustration that can be achieved using pure
browser scripting. Tab should be active for periodic grabbing.
- Targeted at modern browsers
Suggested Approach:
Use a node.js server; you can modify the above script to run as a
standalone program.
Or use any server-side scripting framework such as PHP, Java Spring, etc.
Using Node js approach:
Javascript:
var jsdom = require("node-jsdom");
var fs = require("fs");
// Load the BBC News page with jQuery injected, then append the text of every
// ".title-link span" element to sample.txt (each entry prefixed with "\r").
jsdom.env({
    url: "http://www.bbc.co.uk/news",
    scripts: ["http://code.jquery.com/jquery.js"],
    /**
     * Called by jsdom once the page and the injected scripts have loaded.
     *
     * @param {*} errors - Load errors reported by jsdom, if any.
     * @param {Object} window - Window of the loaded page; `window.$` is the injected jQuery.
     */
    done: function (errors, window) {
        if (errors) {
            console.error("Errors while loading the page:", errors);
        }
        // Without a window or the injected jQuery there is nothing to query.
        if (!window || !window.$) {
            return;
        }
        var $ = window.$;
        console.log("HN Links");

        // Gather all entries first so they are written in document order with
        // one call, instead of racing several asynchronous writes.
        var entries = $(".title-link span").map(function () {
            return "\r" + $(this).text();
        }).get();

        // Nothing matched: leave the file system untouched.
        if (entries.length === 0) {
            return;
        }

        try {
            // appendFileSync creates the file when it does not exist yet, so
            // no separate existence check is needed.
            fs.appendFileSync("sample.txt", entries.join(""));
        } catch (err) {
            console.error("Could not write sample.txt:", err);
        }
    }
});
Dependencies for the above code:
Hope this helps you and others as well.