# Heritage Web Directories - robots.txt
# Purpose: control search engine crawling and keep crawlers out of
# non-public or low-value URLs.
#
# How this file is evaluated (Robots Exclusion Protocol, RFC 9309):
#   * A crawler obeys ONLY the group whose User-agent token matches it most
#     specifically. The "*" group is a fallback for crawlers that are not
#     named anywhere else in this file.
#   * User-agent matching is case-insensitive.
#   * Several User-agent lines may share one group of rules.
#   * Within a group the longest matching path wins; on a tie Allow wins.
#   * Crawl-delay is a non-standard extension; some crawlers ignore it.
#   * robots.txt is advisory. It is not an access control mechanism.
#
# Groups are written without blank lines inside them because some legacy
# parsers treat a blank line as the end of a group.

# ====================================
# SECTION 1: ALLOWED BOTS
# ====================================
# NOTE: every bot named in this section is allowed to crawl the whole site.
# The path rules in Section 3 do NOT apply to these bots.

# --- SEARCH ENGINES ---
# Major search engines that drive organic traffic
User-agent: Googlebot
User-agent: Bingbot
User-agent: Slurp
User-agent: DuckDuckBot
Allow: /

# --- SEO TOOLS ---
# Reputable SEO tools for analysis and insights - 1 second crawl delay
User-agent: AhrefsBot
User-agent: SemrushBot
User-agent: Screaming Frog SEO Spider
User-agent: dotbot
User-agent: MJ12bot
Crawl-delay: 1
Allow: /

# --- AI BOTS (AMERICAN) ---
# American AI companies - 1 second crawl delay
User-agent: GPTBot
User-agent: ChatGPT-User
User-agent: ClaudeBot
User-agent: anthropic-ai
User-agent: CCBot
User-agent: PerplexityBot
User-agent: Amazonbot
User-agent: Applebot
Crawl-delay: 1
Allow: /

# --- SOCIAL MEDIA CRAWLERS ---
# Important for content sharing and link previews
User-agent: facebookexternalhit
User-agent: Twitterbot
User-agent: LinkedInBot
User-agent: WhatsApp
User-agent: Pinterestbot
User-agent: Snapbot
User-agent: redditbot
User-agent: Discordbot
User-agent: Slackbot
User-agent: TelegramBot
Allow: /

# --- DEVELOPMENT & TESTING TOOLS ---
# Google's official testing and verification bots
User-agent: Google-InspectionTool
User-agent: Google-Site-Verification
User-agent: Lighthouse
User-agent: Google-PageSpeed
User-agent: Chrome-Lighthouse
Allow: /

# --- MONITORING SERVICES ---
User-agent: UptimeRobot
Allow: /

# ====================================
# SECTION 2: BLOCKED BOTS
# ====================================
# Known bad actors and resource-intensive crawlers.
# These bots are asked not to crawl anything on the site.

# --- FOREIGN SEARCH ENGINES ---
# Chinese search engines
User-agent: Baiduspider
User-agent: Bytespider
User-agent: PetalBot
Disallow: /

# Russian search engines
# (Only one entry is needed: "Yandexbot" and "YandexBot" are the same token
# because User-agent matching is case-insensitive.)
User-agent: YandexBot
Disallow: /

# --- AGGRESSIVE SEO BOTS ---
# Less reputable or overly aggressive SEO crawlers
User-agent: serpstatbot
User-agent: SEOkicks
User-agent: DataForSeoBot
User-agent: BLEXBot
User-agent: Barkrowler
Disallow: /

# --- OTHER UNWANTED BOTS ---
User-agent: TikTokSpider
User-agent: VelenPublicWebCrawler
User-agent: AwarioBot
User-agent: MojeekBot
User-agent: SeznamBot
User-agent: Zealbot
User-agent: AliyunSecBot
User-agent: Scrapy
User-agent: proximic
User-agent: MauiBot
User-agent: AlphaBot
User-agent: SputnikBot
User-agent: ZoominfoBot
User-agent: megaindex.ru
Disallow: /

# ====================================
# SECTION 3: DEFAULT RULES FOR UNLISTED BOTS
# ====================================
# These rules apply ONLY to crawlers that are not named in Section 1 or
# Section 2. A crawler that matches a named group ignores this group.
#
# NOTE(review): if these path rules are also meant to bind Googlebot,
# Bingbot and the other allowed bots, the rules must be repeated in (or
# those User-agent lines moved into) a group that names them - confirm the
# intent with the site owner before changing it.
#
# This is the single "*" group for the file. Parsers combine all groups
# that name the same user-agent, so the conservative crawl delay and the
# catch-all query-string rule that used to live in a second "*" group are
# folded in here and the repeated rules are listed once.
User-agent: *
Crawl-delay: 5
#
# --- BLOCKED PATHS ---
# Admin and member areas
Disallow: /admin/
Disallow: /members/
Disallow: /login
Disallow: /api/
Disallow: /dashboard/
#
# Lead registration
Disallow: /lead/register-step-1
#
# AJAX and API endpoints
# (/api/v1/ and /api/v2/ are already covered by /api/ above.)
Disallow: /ajax/
Disallow: /api/v1/
Disallow: /api/v2/
Disallow: /lead/ajax/
Disallow: /members/ajax/
#
# aMember specific paths
Disallow: /amember/
Disallow: /application/
Disallow: /library/
Disallow: /vendor/
#
# Any URL with a query string (prevents duplicate content).
# "/*?" matches every URL that contains "?", so it already covers the
# narrower "?name=" patterns below; they are kept to document which
# parameters motivated the rule.
Disallow: /*?
#
# Search and filter pages
Disallow: /*?search=
Disallow: /*?filter=
Disallow: /*?sort=
Disallow: /*?page=
Disallow: /*&
Disallow: /search/
Disallow: /search?
#
# Session and tracking parameters
Disallow: /*?session=
Disallow: /*?utm_
Disallow: /*?ref=
Disallow: /*?source=
#
# Technical/system files
Disallow: /*.php$
Disallow: /config/
Disallow: /includes/
Disallow: /cache/
Disallow: /tmp/
Disallow: /temp/
Disallow: /logs/
Disallow: /.git/
Disallow: /.env
Disallow: /composer.json
Disallow: /package.json
#
# Security-sensitive paths
# (Listing a path here only asks crawlers to stay away; it does not
# protect the path from being requested.)
Disallow: /backup/
Disallow: /backups/
Disallow: /sql/
Disallow: /.well-known/
Disallow: /wp-admin/
Disallow: /wp-content/
#
# --- EXPLICITLY ALLOWED PATHS ---
# Each pattern ends in "$", so it matches that exact URL only
# (no sub-paths and no query strings).
#
# Homepage
Allow: /$
#
# Country pages
Allow: /us/$
Allow: /canada/$
#
# All US state pages
Allow: /us/alabama/$
Allow: /us/alaska/$
Allow: /us/arizona/$
Allow: /us/arkansas/$
Allow: /us/california/$
Allow: /us/colorado/$
Allow: /us/connecticut/$
Allow: /us/delaware/$
Allow: /us/district-of-columbia/$
Allow: /us/florida/$
Allow: /us/georgia/$
Allow: /us/hawaii/$
Allow: /us/idaho/$
Allow: /us/illinois/$
Allow: /us/indiana/$
Allow: /us/iowa/$
Allow: /us/kansas/$
Allow: /us/kentucky/$
Allow: /us/louisiana/$
Allow: /us/maine/$
Allow: /us/maryland/$
Allow: /us/massachusetts/$
Allow: /us/michigan/$
Allow: /us/minnesota/$
Allow: /us/mississippi/$
Allow: /us/missouri/$
Allow: /us/montana/$
Allow: /us/nebraska/$
Allow: /us/nevada/$
Allow: /us/new-hampshire/$
Allow: /us/new-jersey/$
Allow: /us/new-mexico/$
Allow: /us/new-york/$
Allow: /us/north-carolina/$
Allow: /us/north-dakota/$
Allow: /us/ohio/$
Allow: /us/oklahoma/$
Allow: /us/oregon/$
Allow: /us/pennsylvania/$
Allow: /us/rhode-island/$
Allow: /us/south-carolina/$
Allow: /us/south-dakota/$
Allow: /us/tennessee/$
Allow: /us/texas/$
Allow: /us/utah/$
Allow: /us/vermont/$
Allow: /us/virginia/$
Allow: /us/washington/$
Allow: /us/west-virginia/$
Allow: /us/wisconsin/$
Allow: /us/wyoming/$
#
# Top 40 US city pages (key for local SEO)
Allow: /us/new-york/new-york/$
Allow: /us/california/los-angeles/$
Allow: /us/illinois/chicago/$
Allow: /us/texas/houston/$
Allow: /us/arizona/phoenix/$
Allow: /us/pennsylvania/philadelphia/$
Allow: /us/texas/san-antonio/$
Allow: /us/california/san-diego/$
Allow: /us/texas/dallas/$
Allow: /us/california/san-jose/$
Allow: /us/texas/austin/$
Allow: /us/florida/jacksonville/$
Allow: /us/california/san-francisco/$
Allow: /us/ohio/columbus/$
Allow: /us/north-carolina/charlotte/$
Allow: /us/indiana/indianapolis/$
Allow: /us/washington/seattle/$
Allow: /us/colorado/denver/$
# NOTE(review): the state list above uses "/us/district-of-columbia/" but
# this city entry uses "/us/washington-dc/" - confirm which slug the site
# actually serves.
Allow: /us/washington-dc/washington/$
Allow: /us/massachusetts/boston/$
Allow: /us/texas/el-paso/$
Allow: /us/tennessee/nashville/$
Allow: /us/oklahoma/oklahoma-city/$
Allow: /us/nevada/las-vegas/$
Allow: /us/kentucky/louisville/$
Allow: /us/oregon/portland/$
Allow: /us/michigan/detroit/$
Allow: /us/tennessee/memphis/$
Allow: /us/maryland/baltimore/$
Allow: /us/wisconsin/milwaukee/$
Allow: /us/new-mexico/albuquerque/$
Allow: /us/arizona/tucson/$
Allow: /us/california/fresno/$
Allow: /us/california/sacramento/$
Allow: /us/missouri/kansas-city/$
Allow: /us/arizona/mesa/$
Allow: /us/georgia/atlanta/$
Allow: /us/nebraska/omaha/$
Allow: /us/colorado/colorado-springs/$
Allow: /us/north-carolina/raleigh/$

# ====================================
# SECTION 4: SITEMAP REFERENCE
# ====================================
# Point to XML sitemaps for better crawling.
# Sitemap lines are independent of the User-agent groups above.
#
# NOTE(review): the sitemap protocol requires each Sitemap value to be a
# full absolute URL (scheme + host + path). The values below are
# path-only and may be ignored by crawlers.
# TODO: prefix each one with the site's canonical origin.
Sitemap: /sitemap.xml
Sitemap: /united-states-country-sitemap.xml
Sitemap: /united-states-california-state-sitemap.xml
Sitemap: /united-states-new-york-state-sitemap.xml
Sitemap: /united-states-texas-state-sitemap.xml
Sitemap: /united-states-illinois-state-sitemap.xml
Sitemap: /united-states-florida-state-sitemap.xml
Sitemap: /united-states-nevada-state-sitemap.xml
Sitemap: /united-states-virginia-state-sitemap.xml
Sitemap: /united-states-arizona-state-sitemap.xml
Sitemap: /united-states-maryland-state-sitemap.xml
Sitemap: /united-states-massachusetts-state-sitemap.xml
Sitemap: /united-states-washington-state-sitemap.xml
Sitemap: /united-states-pennsylvania-state-sitemap.xml
Sitemap: /united-states-michigan-state-sitemap.xml
Sitemap: /united-states-ohio-state-sitemap.xml
Sitemap: /united-states-georgia-state-sitemap.xml
Sitemap: /united-states-new-jersey-state-sitemap.xml
Sitemap: /united-states-oklahoma-state-sitemap.xml
Sitemap: /united-states-colorado-state-sitemap.xml
Sitemap: /united-states-north-carolina-state-sitemap.xml
Sitemap: /united-states-hawaii-state-sitemap.xml
Sitemap: /united-states-indiana-state-sitemap.xml
Sitemap: /united-states-tennessee-state-sitemap.xml
Sitemap: /united-states-south-carolina-state-sitemap.xml
Sitemap: /united-states-louisiana-state-sitemap.xml
Sitemap: /articles-sitemap.xml
Sitemap: /global-sitemap.xml