Latest 0.4.1
Homepage https://github.com/siuying/IGScraperKit
License MIT
Platforms ios 7.0, osx 10.9, requires ARC
Authors

Create dynamic web scraper in Objective-C or Ruby.

Usage

Create a scraper:

#import "IGScraperKit.h"
// Build a scraper whose block returns the text of the first <p> element it finds.
IGScraper *scraper = [IGScraper scraperWithBlock:^id(IGXMLNode *node, NSString *url) {
    id firstParagraph = [[node queryWithXPath:@"//p"] firstObject];
    return [firstParagraph text];
}];

Then scrape HTML with the scraper:

// Run the scraper block against an in-memory HTML string; no URL is supplied.
NSString *html = @"<html><p>Hello World</p></html>";
[scraper scrape:html URL:nil];
// => @"Hello World"
#import "IGScraperKit.h"

IGScraperRecipe* recipe = [[IGScraperRecipe alloc] init];
withScraperBlock:^id(IGXMLNode *node, NSString *url) { // handling for the page ... return data; }];
withScraperBlock:^id(IGXMLNode *node, NSString *url) { // handling for the page ... return data; }];
;

Write Scraper With Ruby

If you want something more dynamic, you can define a Recipe in Ruby:

# A recipe picks the scraper to run for a page based on the page's URL.
class GoogleRecipe < ScraperKit::Recipe
  title "Google Search"

  # Define an HTML scraper with `on url`, where url can be a String or a Regexp.
  # Regexp metacharacters in the URL ('.' and '?') must be escaped: an unescaped
  # '?' would make the preceding 'h' optional, so the pattern would never match
  # a real "search?q=..." URL.
  on %r{https://www\.google\.com/search\?q=.+} do
    # doc is an HTMLDoc object representing the document
    doc.xpath('//h3/a').collect { |node| node.text }
  end

  # If the page you need to parse is not HTML, use `on_text url` instead.
  on_text %r{https://www\.google\.com/search\.json.+} do
    # doc is a String holding the document's text
    JSON.parse(doc)
  end
end

Then load the recipe into IGRecipeRegistry and parse the page:

#import "IGScraperKit.h"

// Load the Ruby recipe source (google.rb) from the app bundle.
NSString *path = [[NSBundle mainBundle] pathForResource:@"google" ofType:@"rb"];
NSString *recipe = [[NSString alloc] initWithContentsOfFile:path encoding:NSUTF8StringEncoding error:nil];

// Register the recipe source that was just read, so the registry can match URLs against it.
IGRecipeRegistry *registry = [[IGRecipeRegistry alloc] init];
[registry loadRecipe:recipe];

// `html` is assumed to hold the page's HTML source, fetched beforehand.
// The URL is what selects the matching `on` block in the recipe.
NSArray *result = [registry scrapeWithHTML:html url:@"https://www.google.com/search?q=doughnuts"];
// => <__NSArrayM 0xed85590>(
// Doughnut - Wikipedia, the free encyclopedia,
// Home - Krispy Kreme Doughnuts and Coffee,
// Voodoo Doughnut - The Magic is in the Hole!!!,
//【超人氣甜甜圈】Krispy Kreme Doughnuts 、台北店 ... - Yam天空部落,
// Doughnut Recipes - Allrecipes.com,
// Doughnut Plant,
// Revolution Doughnuts,
// Sidecar Doughnuts & Coffee,
// Top Pot Hand-Forged Doughnuts,
// Lucky's Doughnuts
// )

To use this, you will need to link the JavaScriptCore framework (iOS 7, OS X 10.9) and define IGSCRAPERKIT_ENABLE_SCRIPTING before importing IGScraperKit.h.

Installation

To install IGScraperKit through CocoaPods, add the following line to your Podfile:

pod "IGScraperKit", '0.3.1'

Or, with Ruby support:

pod "IGScraperKit/Scripting", '0.3.1'

Dependencies

  • IGScraperKit uses IGHTMLQuery for HTML processing. Check the
    Ruby wrappers if you
    need to use the Ruby interface.
  • IGScraperKit uses JavaScriptCore in iOS 7 and Opal for JavaScript support.
  • Use phantomjs to run Opal tests.

Development

  1. Install gems: In the project folder, run the command: bundle install
  2. Install cocoapods: Run the command: pod install

License

MIT License. Check LICENSE.txt.

Latest podspec

{
    "name": "IGScraperKit",
    "version": "0.4.1",
    "summary": "Create dynamic web scraper in Objective-C or Ruby.",
    "description": "Create dynamic web scraper in Objective-C or Ruby. You can define recipes in Ruby, load and modify them in runtime.\n",
    "homepage": "https://github.com/siuying/IGScraperKit",
    "license": {
        "type": "MIT",
        "file": "LICENSE.txt"
    },
    "authors": {
        "Francis Chong": "[email protected]"
    },
    "source": {
        "git": "https://github.com/siuying/IGScraperKit.git",
        "tag": "0.4.1"
    },
    "default_subspecs": "Core",
    "platforms": {
        "ios": "7.0",
        "osx": "10.9"
    },
    "requires_arc": true,
    "libraries": "xml2",
    "xcconfig": {
        "HEADER_SEARCH_PATHS": "$(SDKROOT)/usr/include/libxml2"
    },
    "subspecs": [
        {
            "name": "Core",
            "platforms": {
                "ios": "7.0",
                "osx": "10.9"
            },
            "dependencies": {
                "IGHTMLQuery": [
                    ">= 0.8.3"
                ]
            },
            "source_files": "IGScraperKit/Classes/**/*.{h,m}",
            "exclude_files": "IGScraperKit/Classes/IGRecipeRegistry.{h,m}"
        },
        {
            "name": "Scripting",
            "platforms": {
                "ios": "7.0",
                "osx": "10.9"
            },
            "prefix_header_contents": "#define IGSCRAPERKIT_ENABLE_SCRIPTING",
            "dependencies": {
                "IGHTMLQuery/Ruby": [
                    ">= 0.8.3"
                ],
                "JavaScriptCoreOpalAdditions": [
                    ">= 0.2.4"
                ]
            },
            "source_files": "IGScraperKit/Classes/**/*.{h,m}",
            "resources": [
                "IGScraperKit/JavaScript/**/*.{js}",
                "IGScraperKit/Ruby/**/*.{rb}"
            ]
        }
    ]
}

Pin It on Pinterest

Share This