#
# robots.txt
#
# This file is to prevent the crawling and indexing of certain parts
# of your site by web crawlers and spiders run by sites like Yahoo!
# and Google. By telling these "robots" where not to go on your site,
# you save bandwidth and server resources.
#
# This file will be ignored unless it is at the root of your host:
# Used:    http://example.com/robots.txt
# Ignored: http://example.com/site/robots.txt
#
# For more information about the robots.txt standard, see:
# http://www.robotstxt.org/robotstxt.html

User-agent: *
# CSS, JS, Images
Allow: /core/*.css$
Allow: /core/*.css?
Allow: /core/*.js$
Allow: /core/*.js?
Allow: /core/*.gif
Allow: /core/*.jpg
Allow: /core/*.jpeg
Allow: /core/*.png
Allow: /core/*.svg
Allow: /profiles/*.css$
Allow: /profiles/*.css?
Allow: /profiles/*.js$
Allow: /profiles/*.js?
Allow: /profiles/*.gif
Allow: /profiles/*.jpg
Allow: /profiles/*.jpeg
Allow: /profiles/*.png
Allow: /profiles/*.svg
# Directories
Disallow: /core/
Disallow: /profiles/
# Files
Disallow: /README.txt
Disallow: /web.config
# Paths (clean URLs)
Disallow: /admin/
Disallow: /comment/reply/
Disallow: /filter/tips
Disallow: /node/add/
Disallow: /search/
Disallow: /user/register
Disallow: /user/password
Disallow: /user/login
Disallow: /user/logout
Disallow: /media/oembed
Disallow: /*/media/oembed
# Paths (no clean URLs)
Disallow: /index.php/admin/
Disallow: /index.php/comment/reply/
Disallow: /index.php/filter/tips
Disallow: /index.php/node/add/
Disallow: /index.php/search/
Disallow: /index.php/user/password
Disallow: /index.php/user/register
Disallow: /index.php/user/login
Disallow: /index.php/user/logout
Disallow: /index.php/media/oembed
Disallow: /index.php/*/media/oembed
# Paths - Disallow over-crawling search pages
Disallow: /search
Disallow: /system/
Disallow: /administrator/
Disallow: /wp-content/
Disallow: /wp-admin/
Disallow: /cgi-bin/
Disallow: /wp-includes/
Disallow: /wp/
Disallow: /pantheon_healthcheck
Disallow: /pantheon_healthcheck/
Disallow: /events/past-events
Disallow: /sites/www.math.upenn.edu/themes/bootstrap/
Disallow: /?q=node/add
Disallow: /calendar/day/2022*
Disallow: /calendar/day/2023*
Disallow: /calendar/day/2024*
Disallow: /sites/default/files/*.pdf
Disallow: /application/core/
# Disallow over-crawling of files
Disallow: /*.pdf$
Disallow: /*.xml$
Disallow: /*.php
Disallow: /node?*
Disallow: /node/?*
Disallow: /ALF_DATA/
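# Note on pattern matching: "*" (match any character sequence) and the "$"
# end-of-URL anchor used above are extensions honored by major crawlers such
# as Googlebot and Bingbot; they are not part of the original robots.txt
# standard, so simpler crawlers that only do prefix matching may ignore them.
# When both an Allow and a Disallow rule match, the longest (most specific)
# rule wins. A sketch of how the rules above combine, using hypothetical URLs:
#   /core/misc/drupal.js         -> allowed (Allow: /core/*.js$ is longer than Disallow: /core/)
#   /core/misc/drupal.js?v=10.1  -> allowed (Allow: /core/*.js?)
#   /core/CHANGELOG.txt          -> blocked (Disallow: /core/)
#   /sites/default/files/a.pdf   -> blocked (Disallow: /sites/default/files/*.pdf)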
# Crawl-delay in case crawlers ignore the blocks below. Crawl-delay is a
# non-standard directive, interpreted as seconds between requests by the
# crawlers that honor it; Googlebot ignores it.
User-agent: Brightbot
Crawl-delay: 900000

User-agent: PetalBot
User-agent: SemrushBot
User-agent: PingdomBot
User-agent: MauiBot
User-agent: DotBot
User-agent: AhrefsBot
User-agent: AspiegelBot
User-agent: MJ12Bot
Disallow: /
#Crawl-delay: 900
#User-Agent: *

# AI crawlers, scrapers, and other unwanted bots. Note: wildcard tokens in
# User-agent values (e.g. *AI*, Ping*, Bright*, chat*) are not part of the
# robots.txt standard; most crawlers match their product token literally and
# will ignore those lines.
User-agent: OpenAI-GPT
User-agent: *AI*
User-agent: claudebot
User-agent: gptbot
User-agent: ChatGPT-User
User-agent: Claude-Web
User-agent: SemrushBot
User-agent: Brightbot
User-agent: PingdomBot
User-agent: PetalBot
User-agent: Barkrowler
User-agent: Go-http-client/1.1
User-agent: Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)
User-agent: YandexBot
User-agent: Brightbot 1.0
User-agent: Ping*
User-agent: Bright*
User-agent: chat*
User-agent: Apache-HttpClient/4.5.2 (Java/1.8.0_161)
User-agent: Claude-User
User-agent: Claude-SearchBot
User-agent: CCBot
User-agent: diffbot
User-agent: PerplexityBot
User-agent: Perplexity-User
User-agent: Omgili
User-agent: Omgilibot
User-agent: webzio-extended
User-agent: ImagesiftBot
User-agent: Bytespider
User-agent: TikTokSpider
User-agent: Youbot
User-agent: SemrushBot-OCOB
User-agent: VelenPublicWebCrawler
User-agent: TurnitinBot
User-agent: Timpibot
User-agent: OAI-SearchBot
User-agent: ICC-Crawler
User-agent: AI2Bot
User-agent: AI2Bot-Dolma
User-agent: DataForSeoBot
User-agent: AwarioBot
User-agent: AwarioSmartBot
User-agent: AwarioRssBot
User-agent: PanguBot
User-agent: Kangaroo Bot
User-agent: Sentibot
User-agent: img2dataset
User-agent: Meltwater
User-agent: Seekr
User-agent: peer39_crawler
User-agent: cohere-ai
User-agent: cohere-training-data-crawler
User-agent: DuckAssistBot
User-agent: Scrapy
User-agent: Cotoyogi
User-agent: aiHitBot
User-agent: Factset_spyderbot
User-agent: FirecrawlAgent
Disallow: /
#DisallowAITraining: /

#User-Agent: *
#DisallowAITraining: /
#Content-Usage: ai=n
#Allow: /
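# Note: "DisallowAITraining" and "Content-Usage: ai=n" (kept commented out
# above) are proposed AI opt-out conventions, not part of the robots
# exclusion standard (RFC 9309). Crawlers are expected to skip lines they do
# not recognize, so uncommenting them should be harmless, but only crawlers
# that have adopted these proposals would act on them.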