Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

File Match Rule no filtering unwanted result #46

Open
kenkenchow opened this issue Jul 18, 2019 · 2 comments
Open

File Match Rule no filtering unwanted result #46

kenkenchow opened this issue Jul 18, 2019 · 2 comments

Comments

@kenkenchow
Copy link

I want to filter out paths that end with search.asp*, but have had no luck. May I know how to fix this with the gopa.yml below?

# Rule snippet as posted in the report. The must_not entries are the
# reporter's attempts to exclude paths ending in search.asp.
# NOTE(review): nesting was reconstructed from a flattened paste; values are
# unchanged.
file_match_rule:
  must:
    prefix: []
    contain: []
    suffix: []
  should:
    prefix: []
    contain: []
    suffix: []
  must_not:
    prefix:
      - /search.asp
      - search.asp
    contain:
      - /search.asp
      - search.asp
    suffix:
      - /search.asp
      - search.asp

@medcl
Copy link
Member

medcl commented Jul 20, 2019

May I get the full config?

@kenkenchow
Copy link
Author

kenkenchow commented Aug 1, 2019

# Top-level settings from the reporter's gopa.yml.
# NOTE(review): nesting was reconstructed from a flattened paste — confirm
# against the reporter's actual file.
cluster:
  name: gopa
#node.name: node1

path.data: data
path.logs: log
path.certs: test/cert

cookie_secret: YOUR-GOPA-SECRET

allow_multi_instance: true
max_num_of_instances: 1

tls_enabled: false

logging.level: debug

network:
  host: 127.0.0.1

cluster.seeds:
  # Quoted so the host:port value is always read as a string.
  - "127.0.0.1:10000"

modules:
  # NOTE(review): the hierarchy below (runner -> default_config ->
  # start/process/end -> joint -> parameters) was reconstructed from a
  # flattened paste; confirm against the reporter's actual file.
  - name: pipeline
    enabled: true
    runners:
      # Runner reading from the "check" queue.
      - name: checker
        enabled: true
        input_queue: check
        max_go_routine: 5
        threshold_in_ms: 0
        timeout_in_ms: 5000
        default_config:
          start:
            joint: init_task
            enabled: true
          process:
            - joint: url_normalization
              enabled: true
              parameters:
                follow_all_domain: false
                follow_sub_domain: true
                max_filename_length: 1000
            - joint: url_filter
              enabled: true
              parameters:
                url_match_rule:
                  must_not:
                    prefix:
                      - "mailto:"
                      - "data:image/"
                      - "#"
                      - "javascript:"
                    contain:
                      - "crond/run"
                      - "login"
                      - "logout"
                    suffix: []
                  must:
                    prefix: []
                    contain: []
                    suffix: []
                  should:
                    prefix: []
                    contain: []
                    suffix: []
                host_match_rule:
                  must:
                    prefix: []
                    contain: []
                    suffix: []
                  should:
                    prefix: []
                    contain:
                      - xxxxx.xxxx.com
                    suffix: []
                  must_not:
                    prefix: []
                    contain: []
                    suffix: []
                # This is the rule the issue reports as not filtering out
                # search.asp paths.
                file_match_rule:
                  must:
                    prefix: []
                    contain: []
                    suffix: []
                  should:
                    prefix: []
                    contain: []
                    suffix: []
                  must_not:
                    prefix: []
                    contain:
                      - search
                      - search.asp
                    suffix: []
                file_ext_match_rule:
                  should:
                    prefix: []
                    contain: []
                    suffix: []
                  must:
                    prefix: []
                    contain: []
                    suffix: []
                  must_not:
                    # Duplicate "zip" entry from the original list removed.
                    contain: [zip, exe, jar, js, css, rar, gz, bmp, jpeg, gif, svg, png, jpg, apk]
                    prefix: []
                    suffix: []
            - joint: filter_check
              enabled: true
              parameters:
                filter_key: check_filter
            - joint: task_deduplication
              enabled: true
          end:
            joint: save_task
            enabled: true
            parameters:
              is_create: true
      # Runner reading from the "fetch" queue.
      - name: fetch
        enabled: true
        input_queue: fetch
        max_go_routine: 1
        threshold_in_ms: 0
        timeout_in_ms: 30000
        default_config:
          start:
            joint: init_task
            enabled: true
          process:
            - joint: url_normalization
              enabled: true
              parameters:
                follow_all_domain: false
                follow_sub_domain: true
                max_filename_length: 1000
            - joint: ignore_timeout
              parameters:
                ignore_timeout_after_count: 500
              enabled: true
            - joint: chrome_fetch
              enabled: false
              parameters:
                chrome_host: http://127.0.0.1:9223
                save_screenshot: true
                compress_enabled: true
                timeout_in_seconds: 60
                screenshot_quality: 50
            - joint: fetch
              enabled: true
              parameters:
                timeout_in_seconds: 100
            - joint: parse
              enabled: true
              parameters:
                dispatch_links: true
                save_images: true
                max_depth: 100
                max_breadth: 1
            - joint: extract
              enabled: false
              parameters:
                html_block:
                  your_tag_name1: ".tag_class"
                  your_tag_name2: "#tag_id"
            - joint: html2text
              parameters:
                remove_nonscript: true
                merge_whitespace: false
              enabled: true
            - joint: hash
              enabled: true
            - joint: content_deduplication
              enabled: true
            - joint: update_check_time
              enabled: true
              parameters:
                decelerate_steps: "24h,48h,72h,168h,360h,720h"
                accelerate_steps: "24h,12h,6h,3h,1h30m,45m,30m,20m,10m"
            - joint: lang_detect
              enabled: true
            - joint: save_snapshot_fs
              enabled: false
            - joint: save_snapshot_db
              enabled: true
              parameters:
                compress_enabled: true
                bucket: "Snapshot"
                max_revision: 5
            - joint: index
              enabled: true
          end:
            joint: save_task
            enabled: true
            # NOTE(review): the paste has no "parameters:" line here, unlike
            # the checker runner's end joint; kept as posted — confirm whether
            # these two keys belong under "parameters".
            keep_404: false
            keep_redirected: false
  # NOTE(review): nesting of auth/ui/elasticsearch below was reconstructed
  # from a flattened paste; confirm against the reporter's actual file.
  - name: cluster
    enabled: true

  - name: dispatch
    enabled: true
    failed_retry: false
    new_task: true
    update_task: true
    max_concurrent_fetch_tasks: 10
    fetch_size: 1000
    get_host_config_by_url: false

  - name: api
    enabled: true

  - name: web
    enabled: true
    auth:
      enabled: false
      oauth_provider: github
      oauth_authorize_url: https://github.com/login/oauth/authorize
      oauth_token_url: https://github.com/login/oauth/access_token
      # NOTE(review): OAuth client credentials in plain text — if these are
      # real, rotate them and load them from a secret store instead.
      client_id: "850d747174ace88ce889"
      client_secret: "3d437b64e06371d6f62769320438d3dfc95a8d8e"
      authorized_admin:
        - medcl
        - your_github_login_name

  - name: filter
    enabled: true

  - name: queue
    enabled: true

  - name: index
    enabled: true
    ui:
      enabled: true
    elasticsearch:
      endpoint: http://localhost:9200
      index_prefix: gopa-
      username: elastic
      password: changeme

  - name: elastic
    enabled: true
    store_enabled: true
    orm_enabled: true
    elasticsearch:
      endpoint: http://localhost:9200
      index_prefix: gopa-
      username: elastic
      password: changeme

  - name: stats
    enabled: true

  - name: statsd
    enabled: false
    host: 127.0.0.1
    port: 8125
    namespace: gopa.
    interval_in_seconds: 1

plugins:
  # Chrome plugin entry; debug_port matches the port in the fetch runner's
  # chrome_host (9223).
  - name: chrome
    enabled: true
    command: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
    debug_port: 9223

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants