# Robots.txt for marchouben.nl
# Optimized for SEO - Last updated: 2025-10-26
#
# This file controls how search engines and web crawlers access the website.
#
# How crawlers read this file (RFC 9309):
#   * Every directive must be on its own line.
#   * A crawler obeys ONLY the most specific User-agent group that matches it.
#     Groups never inherit rules from "User-agent: *".
#   * Inside a group the longest matching path wins; on a tie, Allow wins.
#   * "#" starts a comment anywhere on a line, so a path can never contain a
#     literal "#". (A rule such as "Disallow: /*#*" is parsed as
#     "Disallow: /*" and blocks almost the whole site - do not add one.)
#   * Blank lines are avoided inside a group because some older parsers treat
#     them as the end of the group.

# === SEOBILITY BOT SPECIFIC RULES ===
# Restrict the Seobility bot to HTML pages for focused SEO analysis.
# User-agent matching is case-insensitive, so this single group covers both
# "seobility" and "Seobility".
User-agent: seobility
#
# Explicitly allowed content (HTML everywhere, PDFs under /seo/ only)
Allow: /*.html$
Allow: /documentatie/*.html$
Allow: /seo/
Allow: /seo/*.html$
Allow: /seo/*.pdf$
Allow: /error404.php
#
# Block image, style, script and data file types
Disallow: /*.jpg$
Disallow: /*.jpeg$
Disallow: /*.png$
Disallow: /*.gif$
Disallow: /*.webp$
Disallow: /*.svg$
Disallow: /*.bmp$
Disallow: /*.ico$
Disallow: /*.css$
Disallow: /*.js$
Disallow: /*.json$
Disallow: /*.xml$
Disallow: /*.txt$
Disallow: /*.ogg$
Disallow: /*.jfif$
#
# Block documents, archives and media (PDFs under /seo/ stay allowed above)
Disallow: /*.pdf$
Disallow: /*.doc$
Disallow: /*.docx$
Disallow: /*.zip$
Disallow: /*.tar$
Disallow: /*.gz$
Disallow: /*.mp3$
Disallow: /*.wav$
Disallow: /*.mp4$
Disallow: /*.webm$
Disallow: /*.avi$
Disallow: /*.mov$
Disallow: /*.epub$
#
# Block server-side and executable files (error404.php stays allowed above)
Disallow: /*.php$
Disallow: /*.php3$
Disallow: /*.phtml$
Disallow: /*.asp$
Disallow: /*.aspx$
Disallow: /*.jsp$
Disallow: /*.py$
Disallow: /*.rb$
Disallow: /*.pl$
Disallow: /*.exe$
#
# Block directories with non-HTML content
Disallow: /afbeeldingen/
Disallow: /foto/
Disallow: /pictures/
Disallow: /images/
Disallow: /css/
Disallow: /js/
Disallow: /scripts/
Disallow: /assets/
Disallow: /static/
Disallow: /uploads/
Disallow: /data/
Disallow: /logs/
Disallow: /cache/
Disallow: /node_modules/
#
# Block private and system directories.
# Under /documentatie/ only *.html is crawlable (see the Allow rule above).
Disallow: /generator/*
Disallow: /documentatie/*
Disallow: /cgi-data/*
Disallow: /cgi-bin/*
Disallow: /javascript/*
Disallow: /apex/*
Disallow: /STRATO-apps/*
Disallow: /xml/*
Disallow: /watermark/*
Disallow: /font/*
Disallow: /formulieren/*
Disallow: /react/*
#
# Block Apache directory-listing sort parameters and id queries
Disallow: /*?C=M;O=A*
Disallow: /*?C=M;O=D*
Disallow: /*?C=S;O=A*
Disallow: /*?C=S;O=D*
Disallow: /*?C=D;O=A*
Disallow: /*?C=D;O=D*
Disallow: /*?C=N;O=A*
Disallow: /*?C=N;O=D*
Disallow: /*?N=D*
Disallow: /*?M=A*
Disallow: /*?S=A*
Disallow: /*?D=A*
Disallow: /*?id=*
#
# Block specific unwanted files
Disallow: /celebrities/getalbumpics.php
Disallow: /jenniferlopezlyrics.html
Disallow: /google9c29e91ee213ce07.html
Disallow: /yahoo_authkey_0b5e5bead83b4e37.txt
Disallow: /nortonsw_3ffa20f0-ced6-0.html
Disallow: /README.txt
Disallow: /sitemapxml.php
Disallow: /404error.txt
Disallow: /php/index.php
Disallow: /now-playing.xml
Disallow: /video/file-480-900kB.webm
Disallow: /afbeeldingen/youtube.png
Disallow: /upload-multiple-process.php
Disallow: /foto/img-0216g.jpg
Disallow: /foto/img-0305.jpg
#
# Seobility only (currently disabled)
#Disallow: /z1-jepege.php
#Disallow: /z3-pehape.php3
#Disallow: /z5-sitemap.php
#Disallow: /generator/index.php?op=crawlproc&resume=1

# === GENERAL CRAWLERS ===
# Applies to every crawler that has no dedicated group elsewhere in this file
# (Googlebot is one of them; bingbot is not - it has its own group below).
User-agent: *
#
# Allow access to important files
Allow: /sitemap*.xml
Allow: /geo-sitemap.xml
Allow: /ai-sitemap.xml
Allow: /ai-knowledge-graph.json
Allow: /robots.txt
Allow: /humans.txt
Allow: /security.txt
Allow: /.well-known/security.txt
Allow: /.well-known/pgp-key.txt
Allow: /security-policy.html
Allow: /*.css$
Allow: /*.js$
Allow: /*.png$
Allow: /*.jpg$
Allow: /*.jpeg$
Allow: /*.gif$
Allow: /*.svg$
Allow: /*.webp$
Allow: /*.ico$
Allow: /error404.php
#
# Allow the SEO directory and its file types for all crawlers
Allow: /seo/
Allow: /seo/*.pdf$
Allow: /seo/*.html$
Allow: /seo/*.json$
Allow: /seo/*.csv$
Allow: /seo/*.md$
#
# Block private and system directories
Disallow: /apex/
Disallow: /cache/
Disallow: /cgi-bin/
Disallow: /cgi-data/
Disallow: /documentatie/
Disallow: /font/
Disallow: /formulieren/
Disallow: /generator/
Disallow: /noindex/
Disallow: /react/
Disallow: /secret/
Disallow: /STRATO-apps/
Disallow: /watermark/
Disallow: /webmaster/
Disallow: /wendyhouben/
Disallow: /xml/
Disallow: /xxx/
#
# Block specific unwanted files
Disallow: /jenniferlopezlyrics.html
Disallow: /google9c29e91ee213ce07.html
Disallow: /yahoo_authkey_0b5e5bead83b4e37.txt
Disallow: /nortonsw_3ffa20f0-ced6-0.html
Disallow: /README.txt
Disallow: /404error.txt
Disallow: /celebrities/getalbumpics.php
Disallow: /sitemapxml.php
Disallow: /php/index.php
Disallow: /now-playing.xml
Disallow: /upload-multiple-process.php
Disallow: /video/file-480-900kB.webm
Disallow: /afbeeldingen/youtube.png
Disallow: /foto/img-0216g.jpg
Disallow: /foto/img-0305.jpg
#
# Block executable and script files
Disallow: /*.exe$
Disallow: /*.py$
Disallow: /*.rb$
Disallow: /*.pck$
Disallow: /*.xsd$
Disallow: /*.db$
Disallow: /*.pks$
Disallow: /*.pkb$
Disallow: /*.typ$
#
# Block every URL that carries a query string. This one rule also covers the
# Apache directory-listing sort parameters (?C=M;O=A, ?N=D, ...) and ?id=
# links, so they need no separate entries in this group. If this rule is ever
# removed, re-add those specific patterns.
Disallow: /*?*

# === SPECIFIC CRAWLERS ===

# Google Search Bot
# Googlebot intentionally has NO group of its own. A crawler follows only its
# most specific group, so the way to make Googlebot obey the general rules is
# to let it fall through to "User-agent: *". An empty "User-agent: Googlebot"
# line here would be merged with the Googlebot-Image group that follows.

# Google Images Bot - Allow access to images
# This group replaces the "*" rules for Googlebot-Image. It holds no Disallow
# lines, so nothing is blocked for this crawler.
User-agent: Googlebot-Image
Allow: /afbeeldingen/
Allow: /foto/
Allow: /*.png$
Allow: /*.jpg$
Allow: /*.jpeg$
Allow: /*.gif$
Allow: /*.svg$
Allow: /*.webp$

# Google Video Bot - Allow video content
User-agent: Googlebot-Video
Allow: /video/
Disallow: /video/file-480-900kB.webm

# Social Media Bots
# Each group grants full access; the general "*" restrictions do not apply.
User-agent: Twitterbot
Allow: /

User-agent: facebookexternalhit
Allow: /

# === AI ENGINES & GENERATIVE BOTS ===
# Each group grants full access; the extra Allow lines only highlight the
# AI-oriented resources.

# OpenAI GPT crawlers
User-agent: GPTBot
Allow: /
Allow: /ai-sitemap.xml
Allow: /ai-knowledge-graph.json

# Google Bard/Gemini
User-agent: Google-Extended
Allow: /
Allow: /ai-sitemap.xml
Allow: /ai-knowledge-graph.json

# Anthropic Claude
User-agent: ClaudeBot
Allow: /
Allow: /ai-sitemap.xml
Allow: /ai-knowledge-graph.json

# Bing Copilot
User-agent: bingbot
Allow: /
Allow: /ai-sitemap.xml
Allow: /ai-knowledge-graph.json

# Legacy and deprecated crawlers
User-agent: MSIECrawler
Disallow: /

User-agent: psbot
Disallow: /

# === CRAWL DELAY ===
# Prevent server overload from aggressive crawling (currently disabled)
# Crawl-delay: 1

# === SITEMAPS ===
# Primary XML sitemap (updated 2025-10-09)
Sitemap: https://www.marchouben.nl/sitemap.xml

# Geographic/Local SEO sitemap
Sitemap: https://www.marchouben.nl/geo-sitemap.xml

# AI-Optimized sitemap for generative engines
Sitemap: https://www.marchouben.nl/ai-sitemap.xml

# Specialized sitemaps for different content types
Sitemap: https://www.marchouben.nl/sitemap-images.xml
Sitemap: https://www.marchouben.nl/sitemap-video.xml
Sitemap: https://www.marchouben.nl/sitemap-videos.xml

# HTML sitemap for user navigation
# Not listed as a "Sitemap:" directive: that directive only accepts XML,
# RSS/Atom or plain-text sitemaps, not HTML pages.
# Location: https://www.marchouben.nl/sitemap-marchouben.html

# AI Knowledge Graph (JSON-LD)
# Structured data for AI engines and knowledge extraction
# Location: https://www.marchouben.nl/ai-knowledge-graph.json

# === HOST DIRECTIVE ===
# Specify canonical domain for SEO (currently disabled)
# Host: https://www.marchouben.nl