diff --git a/checklists-ext/appservicewebapps_sg_checklist.en.json b/checklists-ext/appservicewebapps_sg_checklist.en.json index 8e5c28a6..f1408835 100644 --- a/checklists-ext/appservicewebapps_sg_checklist.en.json +++ b/checklists-ext/appservicewebapps_sg_checklist.en.json @@ -4,219 +4,219 @@ { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service plan) Choose the Premium tier of an App Service plan for production workloads. Set the maximum and minimum number of workers according to your capacity planning. For more information, see App Service plan overview.", - "description": "A premium App Service plan offers advanced scaling features and ensures redundancy if failures occur.", + "text": "(App Service) Choose the Premium v3 tier of an App Service plan for production workloads. Set the maximum and minimum number of workers according to your capacity planning. For more information, see App Service plan overview.", + "description": "A Premium v3 App Service plan provides advanced scaling features and ensures redundancy if failures occur.", "type": "recommendation", - "guid": "ad95f4ca-bd35-4ac7-a993-733c320fa4c4" + "guid": "40d87fd4-23a1-4956-87ec-5a6dc15debed" }, { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service plan) Enable zone redundancy. Consider provisioning more than three instances to enhance fault tolerance. Check regional support for zone redundancy because not all regions offer this feature.", + "text": "(App Service) Enable zone redundancy. Consider provisioning more than three instances to enhance fault tolerance. Check regional support for zone redundancy because not all regions have this feature.", "description": "Your application can withstand failures in a single zone when multiple instances are spread across zones. 
Traffic automatically shifts to healthy instances in other zones and maintains application reliability if one zone is unavailable.", "type": "recommendation", - "guid": "8a18771a-8a59-47de-905e-6e6b72f36990" + "guid": "1c6ed823-df2b-4b80-8c01-7aa06dfb4fa9" }, { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service) Consider disabling the application request routing (ARR) affinity feature. ARR affinity creates sticky sessions that redirect users to the node that handled their previous requests.", - "description": "Incoming requests are evenly distributed across all available nodes when you disable ARR affinity. Evenly distributed requests prevent traffic from overwhelming any single node. Requests can be seamlessly redirected to other healthy nodes if a node is unavailable. Avoid session affinity to ensure that your App Service instance remains stateless. A stateless App Service reduces complexity and ensures consistent behavior across nodes. Remove sticky sessions so that App Service can add or remove instances to scale horizontally.", + "text": "(Web Apps) Consider disabling the application request routing (ARR) affinity feature. ARR affinity creates sticky sessions that redirect users to the node that handled their previous requests.", + "description": "Incoming requests are evenly distributed across all available nodes when you disable ARR affinity. Evenly distributed requests prevent traffic from overwhelming any single node. Requests can be seamlessly redirected to other healthy nodes if a node is unavailable. Avoid session affinity to ensure that your App Service instance remains stateless. A stateless App Service instance reduces complexity and ensures consistent behavior across nodes. 
Remove sticky sessions so that App Service can add or remove instances to scale horizontally.", "type": "recommendation", - "guid": "5a05980f-0f3f-42c2-af59-563b037aa64c" + "guid": "264f1c63-7cb7-453a-92be-5e8f9c7bce0d" }, { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service) Define automatic healing rules based on request count, slow requests, memory limits, and other indicators that are part of your performance baseline. Consider this configuration as part of your scaling strategy.", + "text": "(Web Apps) Define automatic healing rules based on request count, slow requests, memory limits, and other indicators that are part of your performance baseline. Consider this configuration as part of your scaling strategy.", "description": "Automatic healing rules help your application recover automatically from unexpected problems. The configured rules trigger healing actions when thresholds are breached. Automatic healing enables automatic proactive maintenance.", "type": "recommendation", - "guid": "a92ea6eb-79b0-49f8-be2f-9ecbd56ca794" + "guid": "b22a0f20-67cc-43d1-9876-d02f31c9c626" }, { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service) Enable the health check feature and provide a path that responds to the health check requests.", + "text": "(Web Apps) Enable the health check feature and provide a path that responds to the health check requests.", "description": "Health checks can detect problems early. Then the system can automatically take corrective actions when a health check request fails. The load balancer routes traffic away from unhealthy instances, which directs users to healthy nodes.", "type": "recommendation", - "guid": "8804a347-b18e-4dce-88b6-9beee13dc12b" + "guid": "6fa32dd1-a6ad-4e7f-bda2-a6443c6fc8ad" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) Assign managed identities to the web app. 
To maintain isolation boundaries, don't share or reuse identities across applications. Make sure that you securely connect to your container registry if you use containers for your deployment.", + "text": "(Web Apps) Assign managed identities to the web app. To maintain isolation boundaries, don't share or reuse identities across applications. Make sure that you securely connect to your container registry if you use containers for your deployment.", "description": "The application retrieves secrets from Key Vault to authenticate outward communication from the application. Azure manages the identity and doesn't require you to provision or rotate any secrets. You have distinct identities for granularity of control. Distinct identities make revocation easy if an identity is compromised.", "type": "recommendation", - "guid": "ffcc54ba-464e-4ad4-b96c-de8a6959ba61" + "guid": "2b21e2cc-a872-4ee0-b595-30ec75ca161d" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) Configure custom domains for applications. Disable HTTP and only accept HTTPS requests.", - "description": "Custom domains enable secure communication through HTTPS using Transport Layer Security (TLS) protocol, which ensures the protection of sensitive data and builds user trust.", + "text": "(Web Apps) Configure custom domains for applications. Disable HTTP and only accept HTTPS requests.", + "description": "Custom domains enable secure communication through HTTPS by using TLS protocol, which helps ensure the protection of sensitive data and builds user trust.", "type": "recommendation", - "guid": "d0450dd8-5e4e-45a2-ae67-83de17e9932c" + "guid": "4beaf8ba-bd57-45a4-96e2-395463b72bb2" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) valuate whether App Service built-in authentication is the right mechanism to authenticate users that access your application. App Service built-in authentication integrates with Microsoft Entra ID. 
This feature handles token validation and user identity management across multiple sign-in providers and supports OpenID Connect. With this feature, you don't have authorization at a granular level, and you don't have a mechanism to test authentication.", + "text": "(Web Apps) Evaluate whether App Service built-in authentication is the right mechanism to authenticate users that access your application. App Service built-in authentication integrates with Microsoft Entra ID. This feature handles token validation and user identity management across multiple sign-in providers and supports OpenID Connect. With this feature, you don't have authorization at a granular level, and you don't have a mechanism to test authentication.", "description": "When you use this feature, you don't have to use authentication libraries in application code, which reduces complexity. The user is already authenticated when a request reaches the application.", "type": "recommendation", - "guid": "b2b6b6df-7bd0-4394-a6df-86c3a15bcaf7" + "guid": "db43eb7e-af61-4c22-a4a1-5a04ea51e30c" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) Configure the application for virtual network integration. Use private endpoints for App Service apps. Block all public traffic. Route the container image pull through the virtual network integration. All outgoing traffic from the application passes through the virtual network.", + "text": "(Web Apps) Configure the application for virtual network integration. Use private endpoints for App Service apps. Block all public traffic. Route the container image pull through the virtual network integration. All outgoing traffic from the application passes through the virtual network.", "description": "Get the security benefits of using an Azure virtual network. For example, the application can securely access resources within the network. Add a private endpoint to help protect your application. 
Private endpoints limit direct exposure to the public network and allow controlled access through the reverse proxy.", "type": "recommendation", - "guid": "bc1fd50b-a78a-44e6-bbd6-db1c75fa8fdd" + "guid": "d601d75c-5658-431b-a7a4-c77774c29921" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) To implement hardening: - Disable basic authentication that uses a username and password in favor of Microsoft Entra ID-based authentication. - Turn off remote debugging so that inbound ports aren't opened. - Enable CORS policies to tighten incoming requests. - Disable protocols, such as FTP.", - "description": "We don't recommend basic authentication as a secure deployment method. Microsoft Entra ID employs OAuth 2.0 token-based authentication, which offers numerous advantages and enhancements that address the limitations that are associated with basic authentication. Policies restrict access to application resources, only allow requests from specific domains, and secure cross-region requests.", + "text": "(Web Apps) To implement hardening: - Disable basic authentication that uses a username and password in favor of Microsoft Entra ID-based authentication. - Turn off remote debugging so that inbound ports aren't opened. - Enable CORS policies to tighten incoming requests. - Disable protocols, such as FTP.", + "description": "We don't recommend basic authentication as a secure deployment method. Microsoft Entra ID employs OAuth 2.0 token-based authentication, which provides numerous advantages and enhancements that address the limitations that are associated with basic authentication. 
Policies restrict access to application resources, only allow requests from specific domains, and secure cross-region requests.", "type": "recommendation", - "guid": "aed08f98-d32e-43c4-8879-e2a3640ec82a" + "guid": "0ebd8bfe-727a-4f64-b277-f01dee3cb922" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) Always use Key Vault references as app settings.", + "text": "(Web Apps) Always use Key Vault references as app settings.", "description": "Secrets are kept separate from your app's configuration. App settings are encrypted at rest. App Service also manages secret rotations.", "type": "recommendation", - "guid": "ed800519-baa0-449d-8c29-c5fae194116a" + "guid": "afa606b5-c783-42fa-99fa-e04c7fd0a9b9" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service plan) Enable Microsoft Defender for Cloud for App Service.", + "text": "(App Service) Enable Microsoft Defender for Cloud for App Service.", "description": "Get real-time protection for resources that run in an App Service plan. Guard against threats and enhance your overall security posture.", "type": "recommendation", - "guid": "4c020315-db82-4fd8-a3da-8f2b80bd5b4f" + "guid": "bf33653c-be93-4cf9-8f87-56b026b326f5" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service plan) Enable diagnostic logging and add instrumentation to your app. The logs are sent to Azure Storage accounts, Azure Event Hubs, and Log Analytics. For more information about audit log types, see Supported log types.", + "text": "(App Service) Enable diagnostic logging and add instrumentation to your app. The logs are sent to Azure Storage accounts, Azure Event Hubs, and Log Analytics. For more information about audit log types, see Supported log types.", "description": "Logging captures access patterns. It records relevant events that provide valuable insights into how users interact with an application or platform. 
This information is crucial for accountability, compliance, and security purposes.", "type": "recommendation", - "guid": "a1278dd3-3ed5-43b3-9544-69ccd3694db1" + "guid": "5d1c480f-435f-449e-a7cb-21de28d7c8b3" }, { "waf": "Cost", "service": "App Service Web Apps", - "text": "(App Service plan) Choose Free or Basic tiers for lower environments. We recommend these tiers for experimental use. Remove the tiers when you no longer need them.", - "description": "The Free and Basic tiers are budget-friendly compared to higher tiers. They provide a cost-effective solution for nonproduction environments that don't need the full features and performance of premium plans.", + "text": "(App Service) Choose free tiers or basic tiers for lower environments. We recommend these tiers for experimental use. Remove the tiers when you no longer need them.", + "description": "The free tiers and basic tiers are budget-friendly compared to higher tiers. They provide a cost-effective solution for nonproduction environments that don't need the full features and performance of premium plans.", "type": "recommendation", - "guid": "73ebf138-84db-4fcf-9829-c3196790bb4b" + "guid": "ee14a81d-6059-46a3-8f66-63ba4fe6d02e" }, { "waf": "Cost", "service": "App Service Web Apps", - "text": "(App Service plan) Take advantage of discounts and explore preferred pricing for: - Lower environments with dev/test plans. - Azure reservations and Azure savings plans for dedicated compute that you provision in the Premium V3 tier and App Service Environment. Use reserved instances for stable workloads that have predictable usage patterns.", - "description": "Dev/test plans provide reduced rates for Azure services, which makes them cost-effective for nonproduction environments. Use reserved instances to prepay for compute resources and get significant discounts.", + "text": "(App Service) Take advantage of discounts and explore preferred pricing for: - Lower environments with dev/test plans. 
- Azure reservations and Azure savings plans for dedicated compute that you provision in the Premium v3 tier and App Service Environment. Use reserved instances for stable workloads that have predictable usage patterns.", + "description": "Dev/test plans provide reduced rates for Azure services, which make them cost-effective for nonproduction environments. Use reserved instances to prepay for compute resources and get significant discounts.", "type": "recommendation", - "guid": "e4b9b5ec-6d62-4457-8225-98070a48f1f0" + "guid": "7de45b8a-d90a-4576-b918-76f498e983d4" }, { "waf": "Cost", "service": "App Service Web Apps", - "text": "(App Service) Monitor costs that App Service resources incur. Run the cost analysis tool in the Azure portal. Create budgets and alerts to notify stakeholders.", + "text": "(Web Apps) Monitor costs that App Service resources incur. Run the cost analysis tool in the Azure portal. Create budgets and alerts to notify stakeholders.", "description": "You can identify cost spikes, inefficiencies, or unexpected expenses early on. This proactive approach helps you to provide budgetary controls to prevent overspending.", "type": "recommendation", - "guid": "c3c919e3-e1ef-4566-8789-edada78d7095" + "guid": "c80e059d-89c0-4ad3-b2d5-3b4383c75d82" }, { "waf": "Cost", "service": "App Service Web Apps", - "text": "(App Service plan) Scale in when demand decreases. To scale in, define scale rules to reduce the number of instances in Azure Monitor.", + "text": "(App Service) Scale in when demand decreases. To scale in, define scale rules to reduce the number of instances in Azure Monitor.", "description": "Prevent wastage and reduce unnecessary expenses.", "type": "recommendation", - "guid": "a5257a31-c39f-4c09-85d0-d34edbfc0bbd" + "guid": "4cedd1b9-a220-4b30-9df1-1bfc3b0b9696" }, { "waf": "Operations", "service": "App Service Web Apps", - "text": "(App Service) Monitor the health of your instances and activate instance health probes. 
Set up a specific path for handling health probe requests.", + "text": "(Web Apps) Monitor the health of your instances and activate instance health probes. Set up a specific path for handling health probe requests.", "description": "You can detect problems promptly and take necessary actions to maintain availability and performance.", "type": "recommendation", - "guid": "29f5cff9-45d7-4ade-8e27-94ca0ba3b1d3" + "guid": "f7a614c0-9157-4e89-85e6-e343f1bfb353" }, { "waf": "Operations", "service": "App Service Web Apps", - "text": "(App Service) Enable diagnostics logs for the application and the instance. Frequent logging can slow down the performance of the system, add to storage costs, and introduce risk if you have unsecure access to logs. Follow these best practices: - Log the right level of information. - Set retention policies. - Keep an audit trail of authorized access and unauthorized attempts. - Treat logs as data and apply data-protection controls.", + "text": "(Web Apps) Enable diagnostics logs for the application and the instance. Frequent logging can slow down the performance of the system, add to storage costs, and introduce risk if you have unsecure access to logs. Follow these best practices: - Log the right level of information. - Set retention policies. - Keep an audit trail of authorized access and unauthorized attempts. - Treat logs as data and apply data-protection controls.", "description": "Diagnostic logs provide valuable insights into your app's behavior. 
Monitor traffic patterns and identify anomalies.", "type": "recommendation", - "guid": "9540f299-ca72-4849-a58a-78153436fc26" + "guid": "507db09c-7ac9-40b8-9ae6-46e5cece36ee" }, { "waf": "Operations", "service": "App Service Web Apps", - "text": "(App Service) Take advantage of App Service managed certificates to offload certification management to Azure.", + "text": "(Web Apps) Take advantage of App Service-managed certificates to offload certification management to Azure.", "description": "App Service automatically handles processes like certificate procurement, certificate verification, certificate renewal, and importing certificates from Key Vault. Alternatively, upload your certificate to Key Vault and authorize the App Service resource provider to access it.", "type": "recommendation", - "guid": "4a17086d-c18e-4f8e-95ec-2f2b2ec65d17" + "guid": "07c24187-22d4-4c9d-9cd4-abde70a0c773" }, { "waf": "Operations", "service": "App Service Web Apps", - "text": "(App Service plan) Validate app changes in the staging slot before you swap it with the production slot.", + "text": "(App Service) Validate app changes in the staging slot before you swap it with the production slot.", "description": "Avoid downtime and errors. Quickly revert to the last-known good state if you detect a problem after a swap.", "type": "recommendation", - "guid": "0f0b02b2-941d-45a9-973a-74a01899a16d" + "guid": "4b6f00dd-15a5-4f77-8108-bea5c5177e76" }, { "waf": "Performance", "service": "App Service Web Apps", - "text": "Enable the Always On setting when applications share a single App Service plan. App Service apps automatically unload when idle to save resources. The next request triggers a cold start, which can cause request timeouts.", + "text": "(App Service) Enable the Always On setting when applications share a single App Service plan. App Service apps automatically unload when idle to save resources. 
The next request triggers a cold start, which can cause request time-outs.", "description": "The application is never unloaded with Always On enabled.", "type": "recommendation", - "guid": "24d94b35-de37-4c04-9aea-dec880bf216c" + "guid": "d5cef89f-d087-4c1e-bcdd-0fefe15a0d4c" }, { "waf": "Performance", "service": "App Service Web Apps", - "text": "Consider using HTTP/2 for applications to improve protocol efficiency.", + "text": "(Web Apps) Consider using HTTP/2 for applications to improve protocol efficiency.", "description": "Choose HTTP/2 over HTTP/1.1 because HTTP/2 fully multiplexes connections, reuses connections to reduce overhead, and compresses headers to minimize data transfer.", "type": "recommendation", - "guid": "f5d46d58-7c3f-4917-a4f1-b97aa98a00c4" + "guid": "358bfdcf-3dfe-44de-ac97-35a3b21fa5ed" } ], "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -253,6 +253,6 @@ "name": "App Service Web Apps Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/azureapplicationgateway_sg_checklist.en.json b/checklists-ext/azureapplicationgateway_sg_checklist.en.json index f497f846..9724abb6 100644 --- a/checklists-ext/azureapplicationgateway_sg_checklist.en.json +++ b/checklists-ext/azureapplicationgateway_sg_checklist.en.json @@ -125,7 +125,7 @@ "waf": "Performance", "service": "Azure Application Gateway", "text": "Set the minimum instance count to an optimal level based on 
you estimated instance count, actual Application Gateway autoscaling trends, and your application patterns. Check the current compute units for the past month. This metric represents the gateway's CPU usage. To define the minimum instance count, divide the peak usage by 10. For example, if your average current compute units in the past month is 50, set the minimum instance count to five.", - "description": "For Application Gateway v2, autoscaling takes approximately six to seven minutes before the extra set of instances are ready to serve traffic. During that time, if Application Gateway has short spikes in traffic, expect transient latency or loss of traffic.", + "description": "For Application Gateway v2, autoscaling takes approximately three to five minutes before the extra set of instances are ready to serve traffic. During that time, if Application Gateway has short spikes in traffic, expect transient latency or loss of traffic.", "type": "recommendation", "guid": "1c9a7b2a-0e95-4416-8af5-4d173c48870e" }, @@ -149,34 +149,34 @@ "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -213,6 +213,6 @@ "name": "Azure Application Gateway Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/azureblobstorage_sg_checklist.en.json b/checklists-ext/azureblobstorage_sg_checklist.en.json index 6ad0b389..f0658f29 100644 --- a/checklists-ext/azureblobstorage_sg_checklist.en.json +++ 
b/checklists-ext/azureblobstorage_sg_checklist.en.json @@ -25,6 +25,14 @@ "type": "recommendation", "guid": "d095b8f1-86f8-4345-a730-c079330d8a19" }, + { + "waf": "Reliability", + "service": "Azure Blob Storage", + "text": "Configure vaulted backup for Azure Blob as a part of your backup strategy.", + "description": "Vaulted backup enables you to protect the block blob data from ransomware, other malicious attacks, or source data loss. The data is copied and stored in the Backup vault (an offsite copy of data) that can be retained for up to 10 years. If any data loss happens on the source account, you can trigger a restore to an alternate account and get access to your data. Learn more about the supportability for vaulted backup using Azure Backup.", + "type": "recommendation", + "guid": "1d3f2471-54ef-4166-9553-f6819f7a3bc2" + }, { "waf": "Security", "service": "Azure Blob Storage", @@ -213,34 +221,34 @@ "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -277,6 +285,6 @@ "name": "Azure Blob Storage Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/azureexpressroute_sg_checklist.en.json b/checklists-ext/azureexpressroute_sg_checklist.en.json index c87889e4..06762176 100644 --- a/checklists-ext/azureexpressroute_sg_checklist.en.json +++ b/checklists-ext/azureexpressroute_sg_checklist.en.json @@ -4,235 +4,347 @@ { "waf": "Reliability", "service": "Azure Expressroute", - 
"text": "Plan for ExpressRoute circuit or ExpressRoute Direct", - "description": "During the initial planning phase, you want to decide whether you want to configure an ExpressRoute circuit or an ExpressRoute Direct connection. An ExpressRoute circuit allows a private dedicated connection into Azure with the help of a connectivity provider. ExpressRoute Direct allows you to extend the on-premises network directly into the Microsoft network at a peering location. You also need to identify the bandwidth requirement and the SKU type requirement for your business needs.", + "text": "Anticipate and mitigate potential failures when you design and architect Azure ExpressRoute.", + "description": "Anticipating failures leads to the design of a more robust and resilient network architecture that can withstand various failure scenarios.", "type": "recommendation", - "guid": "e89fb4a5-9cdd-4fd5-bb8b-388dee7bc217" + "guid": "81952f45-2485-496e-8e14-c5a1151edc86" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Plan for geo-redundant circuits", - "description": "To plan for disaster recovery, set up ExpressRoute circuits in more than one peering locations. You can create circuits in peering locations in the same metro or different metro and choose to work with different service providers for diverse paths through each circuit. For more information, see Designing for disaster recovery and Designing for high availability.", + "text": "Plan for site resiliency. For Maximum or High resiliency, plan to have multiple paths between the on-premises edge and the peering locations (provider/Microsoft edge locations). For Maximum Resiliency, configure multiple circuits to different peering locations. 
For High Resiliency, configure a circuit between multiple peering locations within the same metropolitan area (also referred to as ExpressRoute Metro) from the on-premises network.", + "description": "By having multiple paths between the on-premises edge and the peering locations, the network can continue to operate even if one path fails. This redundancy is crucial for maintaining continuous connectivity and minimizing downtime.", "type": "recommendation", - "guid": "14b83764-dab1-4741-85ee-7b3cf55cde49" + "guid": "5065a917-f8cd-42c5-8771-440f6e0ac4df" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Plan for Active-Active connectivity", - "description": "This mode provides higher availability of your Expressroute connections. It's also recommended to configure BFD for faster failover if there's a link failure on a connection.", + "text": "Plan for multiple regions and availability zones.", + "description": "Availability zones are physically separate locations within a region, providing fault isolation. This means that failures in one zone don't affect the others, enhancing overall system reliability.", "type": "recommendation", - "guid": "f28fea39-a9e2-45ef-a711-997456c3d42c" + "guid": "aef82860-c236-40cf-917d-f5157ef59662" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Planning for Virtual Network Gateways", - "description": "Create availability zone aware Virtual Network Gateway for higher resiliency and plan for Virtual Network Gateways in different regions for resiliency, disaster recovery, and high availability.", + "text": "Plan for ExpressRoute circuit or ExpressRoute Direct. During the initial planning phase, you want to decide whether you want to configure an ExpressRoute circuit or an ExpressRoute Direct connection. 
You also need to identify the bandwidth requirement and the SKU type requirement for your business needs.", + "description": "An ExpressRoute circuit allows a private dedicated connection into Azure with the help of a connectivity provider. ExpressRoute Direct allows you to extend the on-premises network directly into the Microsoft network at a peering location.", "type": "recommendation", - "guid": "8e29e63c-2da5-4242-8a86-c7083b231b0f" + "guid": "90abe901-2080-4a86-8390-c72709fb33eb" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Monitor circuits and gateway health", - "description": "Set up monitoring and alerts for ExpressRoute circuits and Virtual Network Gateway health based on various metrics available.", + "text": "Choose the right circuit SKU for redundancy by using geographic expansion. The Local, Standard, and Premium SKUs offer different levels of connectivity, access, and performance capabilities. Premium SKU provides the highest level of redundancy with global connectivity to any Azure region worldwide.", + "description": "Choosing the right circuit SKU ensures that you have the appropriate level of redundancy and connectivity for your workloads.", "type": "recommendation", - "guid": "0367cde8-2954-4b20-8be0-fb2b7e50eb91" + "guid": "43ba1711-d33a-44f7-8246-43c625c8f9cb" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Plan for Active-Active connectivity. To improve high availability, redundancy, and resiliency, we recommend operating both connections of an ExpressRoute circuit in active-active mode. 
Additionally, configure Bi-Directional Forwarding Detection (BFD) over both private and Microsoft Peering for faster failover during a link failure.", + "description": "Active-active mode provides higher availability of your ExpressRoute connections. BFD provides rapid detection of link failures, enabling quicker failover to backup paths. This minimizes downtime and ensures continuous connectivity.", "type": "recommendation", - "guid": "c7bf09c0-317f-4f7f-be8d-3d74444757c8" + "guid": "e021493f-08b1-4cda-9f6c-ec2301946bfd" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Plan for geo-redundant circuits.", + "description": "There are scenarios where an ExpressRoute peering location or an entire regional service might experience degradation. Geo-redundancy enhances disaster recovery and high availability by ensuring that there are multiple, geographically diverse paths between on-premises networks and Azure. This reduces the risk of a single point of failure causing a network outage, thereby increasing the reliability and availability of the connection.", + "type": "recommendation", + "guid": "6f4b52a7-d344-4895-8edd-571cb93198da" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "With ExpressRoute Global Reach you can link ExpressRoute circuits together to make a private network between your on-premises networks. Configure ExpressRoute Global Reach on your ExpressRoute circuit Premium SKU.", + "description": "ExpressRoute Global Reach provides an additional layer of redundancy by linking your on-premises networks across different geographical locations directly through the Azure backbone network. 
This ensures that your network remains connected and operational even if one Azure region becomes unavailable.", + "type": "recommendation", + "guid": "affabc60-3d1d-42b3-9fec-26809b287c2e" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Choose different ExpressRoute service providers for each circuit.", + "description": "Diversity in service providers minimizes the risk of network downtime due to a single provider's outage. By choosing different service providers for each circuit, you can ensure that your network remains operational even if one provider experiences an outage. This redundancy is essential for maintaining continuous connectivity and minimizing downtime.", + "type": "recommendation", + "guid": "71f805c4-c1a4-462f-8ca8-69da654681a4" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Configure Site-to-Site VPN over Microsoft peering as a backup to ExpressRoute private peering. Site-to-site VPN provides an additional layer of redundancy and ensures that your network remains operational even if the ExpressRoute connection experiences an outage.", + "description": "By configuring a site-to-site VPN as a backup to ExpressRoute private peering, you can maintain continuous connectivity and minimize downtime.", + "type": "recommendation", + "guid": "e5086996-8a84-4630-8d2e-c2577a24f6a0" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Planning for zone-redundant Virtual Network Gateways. Select the right ExpressRoute Virtual Network Gateway SKU to reflect the correct performance and throughput for your business. Consider deploying a scalable virtual network gateway that allows you to achieve 40-Gbps connectivity and will auto-scale based on your required throughput. 
Deploy ExpressRoute virtual network gateways that are zone-redundant for maximum resiliency and redundancy across Availability Zones.", + "description": "Choosing the appropriate SKU ensures that the gateway can handle the required performance and throughput for your business needs. A scalable virtual network gateway autoscales based on required throughput, allowing the network to adapt to changing demands. This flexibility helps maintain performance during peak usage times and prevents overloading. Additionally, deploying zone-redundant virtual network gateways ensures that the network remains operational even if one availability zone experiences an outage, enhancing overall reliability and resiliency.", + "type": "recommendation", + "guid": "cfdfdc1b-3965-4c7b-8d30-f2899afdf23f" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Conduct reliability testing with the Azure Connectivity Toolkit to ensure that the network design is resilient and can withstand failures.", + "description": "Reliability testing helps identify potential issues and weaknesses in the network design, allowing you to address them proactively. By conducting reliability testing, you can ensure that the network is robust and resilient, minimizing downtime and ensuring continuous connectivity.", + "type": "recommendation", + "guid": "ff042f47-6e3d-49fc-ad0c-103144d8bdda" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Configure monitoring and alerts for ExpressRoute circuits, peering, ports, and Virtual Network Gateway resource health based on various available metrics. This helps in proactively managing and maintaining the health of your network. 
Use Network Insights for ExpressRoute to visualize topological maps and health dashboards, providing a clear view of your configurations and their status.", + "description": "By setting up monitoring and alerts based on various metrics, you can proactively detect and address issues such as increased latency, traffic drops, or circuit downtimes before they impact your services.", + "type": "recommendation", + "guid": "561a49cf-9c9e-4a72-87cb-a1586405c5bf" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Configure service health to notify you about planned and unplanned maintenance. Configuring service health notifies you about changes made to your ExpressRoute circuits.", + "description": "With Service Health, you can view planned and past maintenance in the Azure portal along with configuring alerts and notifications that best suits your needs.", + "type": "recommendation", + "guid": "5cf6bfbb-477a-461f-9a52-0ff5da6c002b" + }, + { + "waf": "Security", + "service": "Azure Expressroute", + "text": "Leverage Azure Security Baseline for ExpressRoute. This security baseline applies guidance from the Microsoft Cloud Security Benchmark version 1.0 to ExpressRoute.", + "description": "The content is organized by the security controls defined in the benchmark and includes related guidance specific to ExpressRoute.", + "type": "recommendation", + "guid": "a534ebbc-4500-4b46-94b4-b865c92a7cf1" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Configure Activity log to send logs to archive", - "description": "Activity logs provide insights into operations that were performed at the subscription level for ExpressRoute resources. With Activity logs, you can determine who and when an operation was performed at the control plane. 
Data retention is only 90 days and required to be stored in Log Analytics, Event Hubs or a storage account for archive.", + "text": "Implement Azure Role-Based Access Control (RBAC) to control who can manage ExpressRoute resources such as ExpressRoute circuits and gateways.", + "description": "By providing granular access management to resources, you can maintain an inventory of administrative accounts with access to ExpressRoute resources and ensure that only authorized users can perform specific actions.", "type": "recommendation", - "guid": "b1f76928-0fc3-407e-8658-f93f2812873f" + "guid": "f240efc1-27f6-497f-b7f4-4965c8985257" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Maintain inventory of administrative accounts", - "description": "Use Azure RBAC to configure roles to limit user accounts that can add, update, or delete peering configuration on an ExpressRoute circuit.", + "text": "Configure MACsec for ExpressRoute Direct ports.", + "description": "MACsec (Media Access Control security) enhances security by encrypting data, ensuring data integrity, and protecting vulnerable protocols.
It secures protocols that are typically not protected on Ethernet links, such as ARP, DHCP, and LACP, thereby preventing potential security threats targeting these protocols.", "type": "recommendation", - "guid": "61fced7c-71af-4061-a73a-b880e8ee4f78" + "guid": "ba4ab7e4-bb65-410c-be00-bb6e66915e87" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Configure MD5 hash on ExpressRoute circuit", - "description": "During configuration of private peering or Microsoft peering, apply an MD5 hash to secure messages between the on-premises route and the MSEE routers.", + "text": "Encrypt traffic using IPsec (Internet Protocol Security) for ExpressRoute private peering or configure a tunnel using private peering.", + "description": "IPsec encrypts data at the network layer (Layer 3) and enhances security by providing encryption, authentication, integrity protection, and compliance. This ensures that data transmitted over ExpressRoute circuits is secure and protected from unauthorized access and tampering.", "type": "recommendation", - "guid": "7091a086-8128-45f8-81e6-c93548433b87" + "guid": "284de61f-d042-4bc7-88b9-27b0ecdc39de" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Configure MACSec for ExpressRoute Direct resources", - "description": "Media Access Control security is a point-to-point security at the data link layer. ExpressRoute Direct supports configuring MACSec to prevent security threats to protocols such as ARP, DHCP, LACP not normally secured on the Ethernet link. 
For more information on how to configure MACSec, see MACSec for ExpressRoute Direct ports.", + "text": "Configure MD5 hash on ExpressRoute circuit during configuration of private peering or Microsoft peering to secure messages between the on-premises route and the MSEE routers.", + "description": "By generating an MD5 hash of the data before transmission and comparing it with the hash generated after reception, you can ensure that the data hasn't been tampered with during transit.", "type": "recommendation", - "guid": "90d32454-fcb9-496d-a411-166a2fe50b6b" + "guid": "7403df09-ca0c-4505-be86-5a21b0ed7b3f" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Encrypt traffic using IPsec", - "description": "Configure a Site-to-site VPN tunnel over your ExpressRoute circuit to encrypt data transferring between your on-premises network and Azure virtual network. You can configure a tunnel using private peering or using Microsoft peering.", + "text": "Configure activity logs and send logs to an archive. Data retention is only 90 days and required to be stored in Log Analytics, Event Hubs or a storage account for archive. For more information about Activity logs in ExpressRoute, see Monitor Azure ExpressRoute.", + "description": "Activity logs provide insights into operations that were performed at the subscription level for ExpressRoute resources. With Activity logs, you can determine who and when an operation was performed at the control plane.", "type": "recommendation", - "guid": "ef702434-e1ce-4c4b-a2a6-553c1d58f881" + "guid": "e89f10b4-091c-4cb5-accf-9d7c9470942b" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Familiarize yourself with ExpressRoute pricing", - "description": "For information about ExpressRoute pricing, see Understand pricing for Azure ExpressRoute.
You can also use the Pricing calculator.Ensure that the options are adequately sized to meet the capacity demand and deliver expected performance without wasting resources.", + "text": "Familiarize yourself with ExpressRoute pricing. Use the Azure Pricing Calculator to estimate the cost. ExpressRoute Direct has a monthly port fee that includes the circuit fee for Local and Standard SKU ExpressRoute circuits. For Premium SKU circuits, there's an additional circuit fee. Outbound data transfer is charged per GB used, depending on the zone number of the peering location. The outbound data charge applies only to Standard and Premium SKUs. For more information, see plan and manage costs for Azure ExpressRoute.", + "description": "Understanding ExpressRoute pricing enables better cost management, informed decision-making, avoidance of unexpected charges and maximization of value.", "type": "recommendation", - "guid": "8ae8772a-7131-42f9-9d2f-ce2aa5bcdd2b" + "guid": "4481df27-a5d4-4e29-914e-48e706b93bc2" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Determine SKU and bandwidth required", - "description": "The way you're charged for your ExpressRoute usage varies between the three different SKU types. With Local SKU, you're automatically charged with an Unlimited data plan. With Standard and Premium SKU, you can select between a Metered or an Unlimited data plan. All ingress data are free of charge except when using the Global Reach add-on. It's important to understand which SKU types and data plan works best for your workload to best optimize cost and budget. For more information resizing ExpressRoute circuit, see upgrading ExpressRoute circuit bandwidth.", + "text": "Determine circuit SKU and bandwidth required. The way you're charged for your ExpressRoute usage varies between the three different SKU types. With the Local SKU, you're automatically charged with an Unlimited data plan. 
With the Standard and Premium SKUs, you can choose between a Metered or an Unlimited data plan. All ingress data is free of charge, except when using the Global Reach add-on, which incurs additional costs for data transfer between different geographical locations. It's important to review and resize your ExpressRoute circuit.", + "description": "It's important to understand which SKU types and data plan works best for your workload to best optimize cost and budget.", "type": "recommendation", - "guid": "18ef72cd-862c-43e8-b9ee-921fb5f079f0" + "guid": "530b911f-cb08-490b-bd4c-d41b2c3514b2" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Determine the ExpressRoute virtual network gateway size", - "description": "ExpressRoute virtual network gateways are used to pass traffic into a virtual network over private peering. Review the performance and scale needs of your preferred Virtual Network Gateway SKU. Select the appropriate gateway SKU on your on-premises to Azure workload.", + "text": "Determine the size of the ExpressRoute Virtual Network Gateway. ExpressRoute virtual network gateways are used to pass traffic into a virtual network over private peering. Select the appropriate gateway SKU on your on-premises to Azure workload. Understand ExpressRoute Gateway pricing based on region and type. ExpressRoute Gateways are charged at an hourly rate plus the cost of an ExpressRoute circuit. Configure scalable ExpressRoute gateways to set minimum and maximum scale units for the gateway, which auto-scales based on active bandwidth or flow count. See ExpressRoute pricing and select ExpressRoute Gateways to see rates for different gateway SKUs.", + "description": "This benefits you by enabling right-sizing of resources, providing flexibility to scale, optimizing performance, and supporting proactive cost management. 
This approach ensures that you're using resources efficiently and cost-effectively.", "type": "recommendation", - "guid": "3655e3bc-9d56-47f6-b7bc-c1a568aa3c8a" + "guid": "85a78212-c440-449d-b331-ad5202a7b9c8" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Monitor cost and create budget alerts", - "description": "Monitor the cost of your ExpressRoute circuit and create alerts for spending anomalies and overspending risks. For more information, see Monitoring ExpressRoute costs.", + "text": "Monitor costs and create budget alerts. Monitor the cost of your ExpressRoute circuit and create alerts for spending anomalies and overspending risks.", + "description": "Monitoring and alerts provide you with tools to control spending, enhance financial planning, ensure accountability, and optimize resource usage.", "type": "recommendation", - "guid": "3ade6188-d99d-47de-99e7-639136d3ac36" + "guid": "6a9c44c8-bd26-4580-89b4-e35256162c7f" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Deprovision and delete ExpressRoute circuits no longer in use.", - "description": "ExpressRoute circuits are charged from the moment they're created. To reduce unnecessary cost, deprovision the circuit with the service provider and delete the ExpressRoute circuit from your subscription. For steps on how to remove an ExpressRoute circuit, see Deprovisioning an ExpressRoute circuit.", + "text": "Deprovision and delete unused ExpressRoute circuits. Azure Advisor can detect ExpressRoute circuits that have been deployed for a significant time but have a provider status of Not Provisioned.", + "description": "ExpressRoute circuits are charged from the moment they're created. 
To reduce unnecessary cost, deprovision the circuit with the service provider and delete the ExpressRoute circuit from your subscription.", "type": "recommendation", - "guid": "ca8f1e36-5762-4510-b0af-5a073cc9185a" + "guid": "c3681305-d1cb-4c55-baf0-ff8cfcf8f4fe" }, { "waf": "Operations", "service": "Azure Expressroute", - "text": "Configure connection monitoring", - "description": "Connection monitoring allows you to monitor connectivity between your on-premises resources and Azure over the ExpressRoute private peering and Microsoft peering connection. Connection monitor can detect networking issues by identifying where along the network path the problem is and help you quickly resolve configuration or hardware failures.", + "text": "Choose the closest peering locations to your on-premises network to reduce latency and costs.", + "description": "By choosing the closest peering location to your on-premises network, you can reduce latency and costs, ensuring optimal performance and cost-effectiveness.", "type": "recommendation", - "guid": "c1dcf762-0191-4963-89d7-3cc1df34b653" + "guid": "53b07e05-6c42-4ce2-ba65-e57a3f63c0ed" }, { "waf": "Operations", "service": "Azure Expressroute", - "text": "Configure Service Health", - "description": "Set up Service Health notifications to alert when planned and upcoming maintenance is happening to all ExpressRoute circuits in your subscription. Service Health also displays past maintenance along with RCA if an unplanned maintenance were to occur.", + "text": "Configure Connection Monitor between your on-premises and Azure network.", + "description": "Connection Monitor can detect networking issues by identifying where along the network path the problem is and help you quickly resolve configuration or hardware failures. 
Connection Monitor is part of Azure Monitor logs.", "type": "recommendation", - "guid": "60f840b9-1818-4967-a115-68e90f47daf3" + "guid": "61ec5c65-1446-4d31-82f4-971da30d2784" }, { "waf": "Operations", "service": "Azure Expressroute", - "text": "Review metrics with Network Insights", - "description": "ExpressRoute Insights with Network Insights allow you to review and analyze ExpressRoute circuits, gateways, connections metrics and health dashboards. ExpressRoute Insights also provide a topology view of your ExpressRoute connections where you can view details of your peering components all in a single place.Metrics available:- Availability- Throughput- Gateway metrics", + "text": "Configure dynamic routing for your Microsoft peering enabled ExpressRoute circuit.", + "description": "Dynamic routing allows for more efficient and flexible routing, ensuring optimal path selection and automatic updates to routing tables in response to network changes.", "type": "recommendation", - "guid": "c3c5fe66-1901-4786-99ea-845944bd6ca3" + "guid": "9745a9fd-cb3e-40b3-a3c4-ed992dee492c" }, { "waf": "Operations", "service": "Azure Expressroute", - "text": "Review ExpressRoute resource metrics", - "description": "ExpressRoute uses Azure Monitor to collect metrics and create alerts base on your configuration. Metrics are collected for ExpressRoute circuits, ExpressRoute gateways, ExpressRoute gateway connections, and ExpressRoute Direct. These metrics are useful for diagnosing connectivity problems and understanding the performance of your ExpressRoute connection.", + "text": "Configure Service Health notifications to alert you when planned and upcoming maintenance is scheduled for all ExpressRoute circuits in your subscription.
Service Health also displays past maintenance events along with Root Cause Analysis (RCA) if an unplanned maintenance event occurs.", + "description": "Service Health notifications provide timely alerts about planned and unplanned maintenance, outages, and early warnings about potential issues. This allows you to stay informed about the status of your ExpressRoute circuits.", "type": "recommendation", - "guid": "03914313-6287-41c4-9e4a-4980c2ee3aa9" + "guid": "f3965cc1-e28d-4a8d-8972-e8fa0c8e450d" + }, + { + "waf": "Operations", + "service": "Azure Expressroute", + "text": "Configure Traffic Collector for ExpressRoute", + "description": "ExpressRoute Traffic Collector enables the sampling of network flows over your ExpressRoute circuits. It supports both Private peering and Microsoft peering, providing near real-time visibility into network throughput and performance.", + "type": "recommendation", + "guid": "2c1275b7-f159-43c8-aa73-60b612e7da29" + }, + { + "waf": "Operations", + "service": "Azure Expressroute", + "text": "Review metrics with Network Insights. ExpressRoute Insights with Network Insights allow you to review and analyze ExpressRoute circuits, gateways, connections metrics and health dashboards. ExpressRoute Insights also provide a topology view of your ExpressRoute connections where you can view details of your peering components all in a single place.", + "description": "Network Insights offers a centralized platform to monitor various metrics across ExpressRoute circuits, gateways, and connections, providing a comprehensive view of network health and performance.", + "type": "recommendation", + "guid": "4d42a1ab-3855-4be4-8e33-65b78b5213e0" + }, + { + "waf": "Operations", + "service": "Azure Expressroute", + "text": "Review ExpressRoute resource metrics.
Use Azure Monitor to collect metrics and create alerts based on your configuration.", + "description": "Metrics are collected for ExpressRoute circuits, ExpressRoute gateways, ExpressRoute gateway connections, and ExpressRoute Direct. These metrics are useful for diagnosing connectivity problems and understanding the performance of your ExpressRoute connection.", + "type": "recommendation", + "guid": "73055347-f8bf-46a5-b0d3-e9a2370786eb" + }, + { + "waf": "Operations", + "service": "Azure Expressroute", + "text": "Review ExpressRoute metrics and create alerts. ExpressRoute uses Azure Monitor to collect metrics and create alerts based on your configuration. Follow the recommendations for designing and creating a monitoring system to implement your monitoring strategy for ExpressRoute and your workloads.", + "description": "Metrics are collected for ExpressRoute circuits, ExpressRoute gateways, ExpressRoute gateway connections, and ExpressRoute Direct. These metrics are useful for diagnosing connectivity problems and understanding the performance of your ExpressRoute connection.", + "type": "recommendation", + "guid": "2839da68-97a8-4aa8-be8f-a1ab534e0e4c" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Test ExpressRoute gateway performance to meet work load requirements.", - "description": "Use Azure Connectivity Toolkit to test performance across your ExpressRoute circuit to understand bandwidth capacity and latency of your network connection.", + "text": "Test ExpressRoute gateway performances to meet work load requirements with the Azure Connectivity Toolkit. Schedule bandwidth-intensive operations such as backups and performance testing at times of low production traffic.", + "description": "The toolkit provides user-friendly tools and interfaces that simplify the process of configuring and managing network connections to Azure. 
The toolkit includes tools to optimize network performance, ensuring efficient and reliable connectivity to Azure services.", "type": "recommendation", - "guid": "07fac8bb-13c5-44b8-a4e8-7e2ed1a84b48" + "guid": "c53182f9-8cba-4d3e-bd8e-cb03518df93a" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Increase the size of the ExpressRoute gateway.", - "description": "Upgrade to a higher gateway SKU for improved throughput performance between on-premises and Azure environment.", + "text": "Plan for scaling of ExpressRoute circuits. Upgrade your ExpressRoute circuit bandwidth to meet your production workload requirements. Circuit bandwidth is shared between all virtual networks connected to the ExpressRoute circuit. Depending on your workload, one or more virtual networks can use up all the bandwidth on the circuit. For more information, see ExpressRoute limits.", + "description": "Upgrading the bandwidth ensures that the network can handle increasing data volumes and more users without compromising performance.", "type": "recommendation", - "guid": "7f788e1a-71dd-4a3e-b19f-6bd8ef8ad815" + "guid": "48522cea-7684-44e5-8349-76cd14057f97" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Upgrade ExpressRoute circuit bandwidth", - "description": "Upgrade your circuit bandwidth to meet your work load requirements. Circuit bandwidth is shared between all virtual networks connected to the ExpressRoute circuit. Depending on your work load, one or more virtual networks can use up all the bandwidth on the circuit.", + "text": "Plan for scaling of ExpressRoute Virtual Network Gateway. Upgrade your ExpressRoute Virtual Network Gateway SKU to meet your production workload requirements.", + "description": "Upgrading to a larger gateway SKU provides higher throughput capabilities, allowing more data to be transferred between on-premises networks and Azure more quickly. 
A larger gateway can manage more simultaneous connections and higher volumes of traffic, reducing the likelihood of network congestion and bottlenecks.", "type": "recommendation", - "guid": "5cd4120c-3a0b-42d0-8114-2663988f43b8" + "guid": "48b29daa-7a24-437a-914b-16b6e8a9270e" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Enable ExpressRoute FastPath for higher throughput", - "description": "If you're using an Ultra performance or an ErGW3AZ virtual network gateway, you can enable FastPath to improve the data path performance between your on-premises network and Azure virtual network.", + "text": "Configure Scalable Gateways to automatically scale for performance.", + "description": "Scalable Gateways allows you to scale up and down automatically with your gateway instances to accommodate performance needs. ErGwScale SKU also enables you to achieve 40-Gbps connectivity to virtual machines and Private Endpoints within the virtual network.", "type": "recommendation", - "guid": "ef778b99-5006-4adf-bef1-d709456a4c51" + "guid": "c30edec2-c955-4365-8337-d74c28757419" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Monitor ExpressRoute circuit and gateway metrics", - "description": "Set up alerts base on ExpressRoute metrics to proactively notify you when a certain threshold is met. These metrics are useful to understand anomalies that can happen with your ExpressRoute connection such as outages and maintenance happening to your ExpressRoute circuits.", + "text": "Enable ExpressRoute FastPath for higher throughput on your virtual network gateway.", + "description": "This feature improves the data path performance between your on-premises network and your virtual network resources by bypassing the gateway. As business needs grow, FastPath provides the necessary bandwidth and performance to support increasing data volumes and more users without compromising performance. 
Enabling FastPath ensures that the network can handle future expansions and new applications, providing long-term performance efficiency.", "type": "recommendation", - "guid": "4a0e8302-106d-48a3-abc4-9e4875a48309" + "guid": "8af58f0b-c1c8-4224-9968-8ea5f0b29580" + }, + { + "waf": "Performance", + "service": "Azure Expressroute", + "text": "Monitor ExpressRoute circuit, port, and gateway metrics. Configure alerts for ExpressRoute metrics to proactively notify you when a certain threshold is met. ExpressRoute circuit metrics supports metrics such as Arp Availability, BitsInPerSecond, DroppedInBitsPerSecond. ExpressRoute port metrics supports metrics such as AdminState, BitsInPerSecond, and FastPathRoutesCount. ExpressRoute Gateway metrics supports metrics such as Bits In Per Second, Active Flows, and Count Of Routes Advertised to Peer.
Monitor performance targets with Connection Monitor.", + "description": "ExpressRoute circuit, port, and gateway metrics are useful to understand anomalies that can happen with your ExpressRoute connection such as outages and maintenance happening to your ExpressRoute circuits. Connection Monitor can detect networking issues by identifying where along the network path the problem is and help you quickly resolve configuration or hardware failures.", + "type": "recommendation", + "guid": "2cda9a37-d763-40ad-850d-063cf8c6b368" } ], "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -269,6 +381,6 @@ "name": "Azure Expressroute Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/azurefiles_sg_checklist.en.json b/checklists-ext/azurefiles_sg_checklist.en.json index 979d7c5a..27eb4003 100644 --- a/checklists-ext/azurefiles_sg_checklist.en.json +++ b/checklists-ext/azurefiles_sg_checklist.en.json @@ -20,10 +20,10 @@ { "waf": "Reliability", "service": "Azure Files", - "text": "As a part of your backup and recovery strategy, enable\u202fsoft delete\u202fand\u202fuse snapshots for point-in-time restore. You can use Azure Backup to back up your SMB file shares. You can also use Azure File Sync to back up on-premises SMB file shares to an Azure file share. 
Azure Backup also allows you to do a vaulted backup (preview) of Azure Files to protect your data from ransomware attacks or source data loss due to a malicious actor or rogue admin. By using vaulted backup, Azure Backup copies and stores data in the Recovery Services vault. This creates an offsite copy of data that you can retain for up to 99 years. Azure Backup creates and manages the recovery points as per the schedule and retention defined in the backup policy. Learn more.", + "text": "As a part of your backup and recovery strategy, enable\u202fsoft delete\u202fand\u202fuse snapshots for point-in-time restore. You can use Azure Backup to back up your SMB file shares. You can also use Azure File Sync to back up on-premises SMB file shares to an Azure file share. Azure Backup also allows you to do a vaulted backup of Azure Files to protect your data from ransomware attacks or source data loss due to a malicious actor or rogue admin. By using vaulted backup, Azure Backup copies and stores data in the Recovery Services vault. This creates an offsite copy of data that you can retain for up to 99 years. Azure Backup creates and manages the recovery points as per the schedule and retention defined in the backup policy. Learn more.", "description": "Soft delete works on a file share level to protect Azure file shares against accidental deletion. Point-in-time restore protects against accidental deletion or corruption because you can restore file shares to an earlier state. 
For more information, see Data protection overview.", "type": "recommendation", - "guid": "8f7b75e8-8b14-44ce-b4a5-3e994995479d" + "guid": "5d398241-7347-47f9-9ae0-037829a8316a" }, { "waf": "Security", @@ -237,34 +237,34 @@ "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -301,6 +301,6 @@ "name": "Azure Files Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/azurefirewall_sg_checklist.en.json b/checklists-ext/azurefirewall_sg_checklist.en.json index 5bcd6293..96f57c14 100644 --- a/checklists-ext/azurefirewall_sg_checklist.en.json +++ b/checklists-ext/azurefirewall_sg_checklist.en.json @@ -245,34 +245,34 @@ "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -309,6 +309,6 @@ "name": "Azure Firewall Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/azurefrontdoor_sg_checklist.en.json 
b/checklists-ext/azurefrontdoor_sg_checklist.en.json index e00c8dc1..31d4ef72 100644 --- a/checklists-ext/azurefrontdoor_sg_checklist.en.json +++ b/checklists-ext/azurefrontdoor_sg_checklist.en.json @@ -4,48 +4,48 @@ { "waf": "Reliability", "service": "Azure Front Door", - "text": "Choose a routing method that supports your deployment strategy. The weighted method, which distributes traffic based on the configured weight coefficient, supports active-active models. A priority-based value that configures the primary region to receive all traffic and send traffic to the secondary region as a backup supports active-passive models. Combine the preceding methods with latency so that the origin with the lowest latency receives traffic.", + "text": "Choose a routing method that supports your deployment strategy. The weighted method, which distributes traffic based on the configured weight coefficient, supports active-active models. A priority-based value that configures the primary region to receive all traffic and send traffic to the secondary region as a backup supports active-passive models. Combine the preceding methods with latency sensitivity configurations so that the origin with the lowest latency receives traffic.", "description": "You can select the best origin resource by using a series of decision steps and your design. The selected origin serves traffic within the allowable latency range in the specified ratio of weights.", "type": "recommendation", - "guid": "00d75bec-38c3-456d-86e6-42031076f429" + "guid": "58826885-79f7-4229-9397-6b18197cdf56" }, { "waf": "Reliability", "service": "Azure Front Door", - "text": "Support redundancy by having multiple origins in one or more back-end pools. Always have redundant instances of your application and make sure each instance exposes an endpoint or origin. 
You can place those origins in one or more back-end pools.", - "description": "Multiple origins support redundancy by distributing traffic across multiple instances of the application. If one instance is unavailable, then other back-end origins can still receive traffic.", + "text": "Support redundancy by having multiple origins in one or more origin groups. Always have redundant instances of your application and make sure each instance exposes an origin. You can place those origins in one or more origin groups.", + "description": "Multiple origins support redundancy by distributing traffic across multiple instances of the application. If one instance is unavailable, then other origins can still receive traffic.", "type": "recommendation", - "guid": "96530f4b-ba7a-4288-aaa0-9167975de796" + "guid": "4b3fd76f-d131-4051-a3c4-9ffd243f175f" }, { "waf": "Reliability", "service": "Azure Front Door", - "text": "Set up health probes on the origin. Configure Azure Front Door to conduct health checks to determine if the back-end instance is available and ready to continue receiving requests.", - "description": "Enabled health probes are part of the health monitoring pattern implementation. Health probes make sure that Azure Front Door only routes traffic to instances that are healthy enough to handle requests. For more information, see Best practices on health probes.", + "text": "Set up health probes on the origin. Configure Azure Front Door to conduct health checks to determine if the origin instance is available and ready to continue receiving requests. For more information, see Best practices on health probes.", + "description": "Enabled health probes are part of the health monitoring pattern implementation. 
Health probes make sure that Azure Front Door only routes traffic to instances that are healthy enough to handle requests.", "type": "recommendation", - "guid": "f802e2c6-ea9b-4e0d-b3d1-b7c4d0b4aa26" + "guid": "09a0cdb5-433c-4315-b813-e1cb7ba3a40e" }, { "waf": "Reliability", "service": "Azure Front Door", - "text": "Set a timeout on forwarding requests to the back end. Adjust the timeout setting according to your endpoints' needs. If you don't, Azure Front Door might close the connection before the origin sends the response. You can also lower the default timeout for Azure Front Door if all of your origins have a shorter timeout. For more information, see Troubleshooting unresponsive requests.", - "description": "Timeouts help prevent performance issues and availability issues by terminating requests that take longer than expected to complete.", + "text": "Set a timeout on forwarding requests to the origin, and avoid long-running requests. Adjust the timeout setting according to your endpoints' needs. If you don't, Azure Front Door might close the connection before the origin sends the response. You can also lower the default timeout for Azure Front Door if all of your origins have a shorter timeout. For more information, see Troubleshooting unresponsive requests.", + "description": "Long-running requests consume system resources. Timeouts help prevent performance issues and availability issues by terminating requests that take longer than expected to complete.", "type": "recommendation", - "guid": "e1ec439d-2bc8-4e6e-b72f-1aafaa2aae58" + "guid": "04c221f2-51bc-4139-9e2f-29fe0b4100fc" }, { "waf": "Reliability", "service": "Azure Front Door", - "text": "Use the same host name on Azure Front Door and your origin. Azure Front Door can rewrite the host header of incoming requests, which is useful when you have multiple custom domain names that route to one origin. 
However, rewriting the host header might cause issues with request cookies and URL redirection.", - "description": "Set the same host name to prevent malfunction with session affinity, authentication, and authorization. For more information, see Preserve the original HTTP host name between a reverse proxy and its back-end web application.", + "text": "Use the same host name on Azure Front Door and your origin. Azure Front Door can rewrite the host header of incoming requests, which is useful when you have multiple custom domain names that route to one origin. However, rewriting the host header might cause issues with request cookies and URL redirection. For more information, see Preserve the original HTTP host name.", + "description": "Set the same host name to prevent malfunction with session affinity, authentication, and authorization.", "type": "recommendation", - "guid": "dd9e850c-0ce5-4ac2-8d8b-53cd21925a8d" + "guid": "2efcb410-5f34-4bc0-b76f-54f0f5ef6d15" }, { "waf": "Reliability", "service": "Azure Front Door", "text": "Decide if your application requires session affinity. If you have high reliability requirements, we recommend that you disable session affinity.", - "description": "With session affinity, user connections stay on the same origin during the user session. If that origin becomes unavailable, the user experience might be disrupted.", + "description": "With session affinity, user connections stay on the same origin during the user session. In some situations, a single origin might become overloaded with requests while other origins are idle. 
If that origin becomes unavailable, the user experience might be disrupted.", "type": "recommendation", "guid": "66bc38be-1613-4bf5-b10b-f130eaff5140" }, @@ -73,11 +73,27 @@ "type": "recommendation", "guid": "214d73c2-77e5-4000-81fb-7463d9183beb" }, + { + "waf": "Security", + "service": "Azure Front Door", + "text": "Send the host header to the origin.", + "description": "The back-end services should be aware of the host name so that they can create rules to accept traffic only from that host.", + "type": "recommendation", + "guid": "c2f472b7-6b70-417d-8e3e-775434a0d6f3" + }, + { + "waf": "Security", + "service": "Azure Front Door", + "text": "Secure the connections from Azure Front Door to your origins. Enable Private Link connectivity to supported origins. If your origin doesn't support Private Link connectivity, use service tags and the `X-Azure-FDID` header to verify the source of the request is your Azure Front Door profile.", + "description": "Ensure that all traffic flows through Azure Front Door, and gets the security benefits such as DDoS protection and WAF inspection.", + "type": "recommendation", + "guid": "89e99eb5-8a53-4304-87d7-b9966838b1f4" + }, { "waf": "Security", "service": "Azure Front Door", "text": "Enable end-to-end TLS, HTTP to HTTPS redirection, and managed TLS certificates when applicable. Review the TLS best practices for Azure Front Door. Use TLS version 1.2 as the minimum allowed version with ciphers that are relevant for your application. Azure Front Door managed certificates should be your default choice for ease of operations. However, if you want to manage the lifecycle of the certificates, use your own certificates in Azure Front Door custom domain endpoints and store them in Key Vault.", - "description": "TLS ensures that data exchanges between the browser, Azure Front Door, and the back-end origins are encrypted to prevent tampering. 
Key Vault offers managed certificate support and simple certificate renewal and rotation.", + "description": "TLS ensures that data exchanges between the browser, Azure Front Door, and the origins are encrypted to prevent tampering. Key Vault offers managed certificate support and simple certificate renewal and rotation.", "type": "recommendation", "guid": "a34600e2-cbae-4bf6-b272-377ec3232184" }, @@ -100,10 +116,10 @@ { "waf": "Cost", "service": "Azure Front Door", - "text": "Disable health checks in single back-end pools.If you have only one origin configured in your Azure Front Door origin group, these calls are unnecessary.", - "description": "You can save on bandwidth costs by disabling requests that aren't required to make routing decisions.", + "text": "Disable health checks in origin groups with a single origin. If you have only one origin configured in your Azure Front Door origin group, these calls are unnecessary.", + "description": "You can save on bandwidth costs by disabling health check requests that aren't required to make routing decisions.", "type": "recommendation", - "guid": "8d573a21-8a95-455f-9c6b-d3df3ef7b64f" + "guid": "e6848d72-c2b4-42eb-834c-c04a3ccdb403" }, { "waf": "Operations", @@ -149,7 +165,7 @@ "waf": "Performance", "service": "Azure Front Door", "text": "Enable caching. You can optimize query strings for caching. For purely static content, ignore query strings to maximize your use of the cache. If your application uses query strings, consider including them in the cache key. Including the query strings in the cache key allows Azure Front Door to serve cached responses or other responses, based on your configuration.", - "description": "Azure Front Door offers a robust content delivery network solution that caches content at the edge of the network. 
Caching reduces the load on the back-end servers and reduces data movement across the network, which helps offload bandwidth usage.", + "description": "Azure Front Door offers a robust content delivery network solution that caches content at the edge of the network. Caching reduces the load on the origin servers and reduces data movement across the network, which helps offload bandwidth usage.", "type": "recommendation", "guid": "e8d431d1-8549-4d6d-a1da-ec67e7ef897e" }, @@ -172,43 +188,43 @@ { "waf": "Performance", "service": "Azure Front Door", - "text": "Evaluate whether you should enable session affinity when requests from the same user should be directed to the same back-end server. From a reliability perspective, we don't recommend this approach. If you use this option, the application should gracefully recover without disrupting user sessions. There's also a tradeoff on load balancing because it restricts the flexibility of distributing traffic across multiple back ends evenly.", + "text": "Evaluate whether you should enable session affinity when requests from the same user should be directed to the same origin server. From a reliability perspective, we don't recommend this approach. If you use this option, the application should gracefully recover without disrupting user sessions. 
There's also a tradeoff on load balancing because it restricts the flexibility of distributing traffic across multiple origins evenly.", "description": "Optimize performance and maintain continuity for user sessions, especially when applications rely on maintaining state information locally.", "type": "recommendation", - "guid": "7dd5da03-4e98-440f-9bfa-a60428db96a0" + "guid": "4f505952-dbc5-4aba-b6e5-97f5cb72b74e" } ], "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -245,6 +261,6 @@ "name": "Azure Front Door Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/azurekubernetesservice_sg_checklist.en.json b/checklists-ext/azurekubernetesservice_sg_checklist.en.json index 10fd2600..b4c30b7c 100644 --- a/checklists-ext/azurekubernetesservice_sg_checklist.en.json +++ b/checklists-ext/azurekubernetesservice_sg_checklist.en.json @@ -4,403 +4,315 @@ { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Control pod scheduling using node selectors and affinity.", - "description": "Allows the Kubernetes scheduler to logically isolate workloads by hardware in the node. Unlike tolerations, pods without a matching node selector can be scheduled on labeled nodes, which allows unused resources on the nodes to consume, but gives priority to pods that define the matching node selector. 
Use node affinity for more flexibility, which allows you to define what happens if the pod can't be matched with a node.", + "text": "(Cluster and workload) Control pod scheduling by using node selectors and affinity. In AKS, the Kubernetes scheduler can logically isolate workloads by hardware in the node. Unlike tolerations, pods that don't have a matching node selector can be scheduled on labeled nodes, but priority is given to pods that define the matching node selector.", + "description": "Node affinity results in more flexibility, which allows you to define what happens if the pod can't be matched with a node.", "type": "recommendation", - "guid": "79d7a487-b319-40ef-b55f-94c265986865" + "guid": "36922d8c-98fb-4d4f-acb6-fa1a9b9d5227" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Ensure proper selection of network plugin based on network requirements and cluster sizing.", - "description": "Azure CNI is required for specific scenarios, for example, Windows-based node pools, specific networking requirements and Kubernetes Network Policies. Reference Kubenet versus Azure CNI for more information.", + "text": "(Cluster) Choose the appropriate network plugin based on network requirements and cluster sizing. Different network plugins provide varying levels of functionality. Azure Container Networking Interface (Azure CNI) is required for specific scenarios, such as Windows-based node pools, some networking requirements, and Kubernetes network policies. 
For more information, see Kubenet versus Azure CNI.", + "description": "The right network plugin can help ensure better compatibility and performance.", "type": "recommendation", - "guid": "d152a74d-cb2f-4c9c-978e-0e3a07b174a5" + "guid": "0d4d75ea-c666-4b13-af6d-10e7e4546da1" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Use the AKS Uptime SLA for production grade clusters.", - "description": "The AKS Uptime SLA guarantees: - `99.95%` availability of the Kubernetes API server endpoint for AKS Clusters that use Azure Availability Zones, or - `99.9%` availability for AKS Clusters that don't use Azure Availability Zones.", + "text": "(Cluster and workload) Use the AKS uptime SLA for production-grade clusters.", + "description": "The workload can support higher availability targets because of the higher availability guarantees of the Kubernetes API server endpoint for AKS clusters.", "type": "recommendation", - "guid": "99cc0fc7-138a-452f-a4dd-fd79daa5049e" + "guid": "7cb635c3-4105-4eea-9c02-b4cc26dcc450" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use availability zones to maximize resilience within an Azure region by distributing AKS agent nodes across physically separate data centers.", - "description": "By spreading node pools across multiple zones, nodes in one node pool will continue running even if another zone has gone down. 
If colocality requirements exist, either a regular VMSS-based AKS deployment into a single zone or proximity placement groups can be used to minimize internode latency.", + "text": "(Cluster) Use availability zones to maximize resilience within an Azure region by distributing AKS agent nodes across physically separate datacenters. If colocality requirements exist, use a regular virtual machine scale sets-based AKS deployment into a single zone or use proximity placement groups to minimize internode latency.", + "description": "By spreading node pools across multiple zones, nodes in one node pool continue to run even if another zone goes down.", "type": "recommendation", - "guid": "29400c1f-e4ff-4ab5-89ff-be93d51d5fa8" + "guid": "0dabe408-2db6-45ca-96d7-a1cc9c9fc35f" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Adopt a multiregion strategy by deploying AKS clusters deployed across different Azure regions to maximize availability and provide business continuity.", - "description": "Internet facing workloads should leverage Azure Front Door or Azure Traffic Manager to route traffic globally across AKS clusters.", + "text": "(Cluster and workload) Define pod resource requests and limits in application deployment manifests. 
Enforce those limits by using Azure Policy.", + "description": "Container CPU and memory resource limits are necessary to prevent resource exhaustion in your Kubernetes cluster.", "type": "recommendation", - "guid": "72c0719b-444b-49fa-b3fb-598de4c4a99d" + "guid": "5bbc78c1-3240-42b2-bce0-e65fc1b8b975" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Define Pod resource requests and limits in application deployment manifests, and enforce with Azure Policy.", - "description": "Container CPU and memory resource limits are necessary to prevent resource exhaustion in your Kubernetes cluster.", + "text": "(Cluster and workload) Keep the system node pool isolated from application workloads. System node pools require a virtual machine (VM) SKU of at least 2 vCPUs and 4 GB of memory. We recommend that you use 4 vCPUs or more. For more information, see System and user node pools.", + "description": "The system node pool hosts critical system pods that are essential for the control plane of your cluster. By isolating these system pods from application workloads, you help ensure that the essential services are unaffected by the resource demands or potential problems caused by a workload.", "type": "recommendation", - "guid": "f230e754-6818-4428-b2ad-7ad543723f80" + "guid": "08830d0e-bf7f-4704-ba1e-93727658788a" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Keep the System node pool isolated from application workloads.", - "description": "System node pools require a VM SKU of at least 2 vCPUs and 4 GB memory, but 4 vCPU or more is recommended. Reference System and user node pools for detailed requirements.", + "text": "(Cluster and workload) Separate applications to dedicated node pools based on specific requirements. 
Avoid large numbers of node pools to reduce management overhead.", + "description": "Applications can share the same configuration and need GPU-enabled VMs, CPU or memory-optimized VMs, or the ability to scale to zero. By dedicating node pools to specific applications, you can help ensure that each application gets the resources it needs without overprovisioning or underutilizing resources.", "type": "recommendation", - "guid": "f51b6cdd-3914-4e11-a8e8-f2cf61788e84" + "guid": "aa2d15d3-5600-4a7f-af98-6187b49aeb8d" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Separate applications to dedicated node pools based on specific requirements.", - "description": "Applications may share the same configuration and need GPU-enabled VMs, CPU or memory optimized VMs, or the ability to scale-to-zero. Avoid large number of node pools to reduce extra management overhead.", + "text": "(Cluster) Use a NAT gateway for clusters that run workloads that make many concurrent outbound connections.", + "description": "Azure NAT Gateway supports reliable egress traffic at scale and helps you avoid the reliability problems that Azure Load Balancer limitations cause with high concurrent outbound traffic.", "type": "recommendation", - "guid": "e303e4a0-0b81-47bd-bd44-bede44825fa2" + "guid": "436a18a9-d302-4008-baa8-a415083d5759" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use a NAT gateway for clusters that run workloads that make many concurrent outbound connections.", - "description": "To avoid reliability issues with Azure Load Balancer limitations with high concurrent outbound traffic, us a NAT Gateway instead to support reliable egress traffic at scale.", + "text": "(Cluster and workload) Use Azure Backup to protect AKS clusters and restore to alternate regions during a disaster. 
Azure Backup supports the backup and restore operations of containerized applications and data running for both cluster state and application data. You can use the backups in a regional disaster scenario and recover backups.", + "description": "Azure Backup with Azure Kubernetes Service (AKS) offers a fully managed, scalable, secure, and cost-effective solution. It enhances the reliability of the workload without the complexities of setting up and maintaining backup infrastructure.", "type": "recommendation", - "guid": "ff84fa04-ee66-4605-85f3-803dd2cab3c4" + "guid": "03f66a10-1d35-4ed0-89b4-ed007b7bb2cf" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use Microsoft Entra integration.", - "description": "Using Microsoft Entra ID centralizes the identity management component. Any change in user account or group status is automatically updated in access to the AKS cluster. The developers and application owners of your Kubernetes cluster need access to different resources.", + "text": "(Cluster) Use managed identities on the cluster.", + "description": "You can avoid the overhead associated with managing and rotating service principals.", "type": "recommendation", - "guid": "03a24546-1e4a-46d6-ad0c-63e80d157b51" + "guid": "3b085d30-3fa8-4497-a6d2-afc260a981a4" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Authenticate with Microsoft Entra ID to Azure Container Registry.", - "description": "AKS and Microsoft Entra ID enables authentication with Azure Container Registry without the use of `imagePullSecrets` secrets. 
Review Authenticate with Azure Container Registry from Azure Kubernetes Service for more information.", + "text": "(Workload) Use Microsoft Entra Workload ID with AKS to access Microsoft Entra protected resources, such as Azure Key Vault and Microsoft Graph, from your workload.", + "description": "Use AKS Workload IDs to protect access to Azure resources by using Microsoft Entra ID RBAC without having to manage credentials directly in your code.", "type": "recommendation", - "guid": "dc50fb2f-f5e9-4ea2-a1b8-36fbf1e1bec4" + "guid": "f806207f-aed4-45e7-864f-6c7f1cb56c96" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Secure network traffic to your API server with private AKS cluster.", - "description": "By default, network traffic between your node pools and the API server travels the Microsoft backbone network; by using a private cluster, you can ensure network traffic to your API server remains on the private network only.", + "text": "(Cluster) Use Microsoft Entra ID to authenticate with Azure Container Registry from AKS.", + "description": "By using Microsoft Entra ID, AKS can authenticate with Container Registry without the use of `imagePullSecrets` secrets.", "type": "recommendation", - "guid": "3d11d647-363f-469c-bea9-9ef27a9bee70" + "guid": "f62f5eac-109e-4cf2-857b-3c3c2bae124c" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: For non-private AKS clusters, use API server authorized IP ranges.", - "description": "When using public clusters, you can still limit the traffic that can reach your clusters API server by using the authorized IP range feature. 
Include sources like the public IPs of your deployment build agents, operations management, and node pools' egress point (such as Azure Firewall).", + "text": "(Cluster) Secure network traffic to your API server by using private AKS cluster if the workload requirements require higher levels of segmentation.", + "description": "By default, network traffic between your node pools and the API server travels the Microsoft backbone network. By using a private cluster, you can help ensure that network traffic to your API server remains on the private network only.", "type": "recommendation", - "guid": "0418c965-c483-41bc-933c-9289108b8ad0" + "guid": "5135ede1-7afd-415c-ad5e-89449b8124b7" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Protect the API server with Microsoft Entra RBAC.", - "description": "Securing access to the Kubernetes API Server is one of the most important things you can do to secure your cluster. Integrate Kubernetes role-based access control (RBAC) with Microsoft Entra ID to control access to the API server. Disable local accounts to enforce all cluster access using Microsoft Entra ID-based identities.", + "text": "(Cluster) For public AKS clusters, use API server-authorized IP address ranges. 
Include sources like the public IP addresses of your deployment build agents, operations management, and node pools' egress point, such as Azure Firewall.", + "description": "When you use public clusters, you can significantly reduce the attack surface of your AKS cluster by limiting the traffic that can reach the API server of your clusters.", "type": "recommendation", - "guid": "0418c965-c483-41bc-933c-9289108b8ad0" + "guid": "fc234dca-a17f-4df9-acde-0d7f28400315" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Protect the API server with Microsoft Entra RBAC.", - "description": "Securing access to the Kubernetes API Server is one of the most important things you can do to secure your cluster. Integrate Kubernetes role-based access control (RBAC) with Microsoft Entra ID to control access to the API server. Disable local accounts to enforce all cluster access using Microsoft Entra ID-based identities.", + "text": "(Cluster) Protect the API server by using Microsoft Entra ID RBAC. Disable local accounts to enforce all cluster access by using Microsoft Entra ID-based identities.", + "description": "Securing access to the Kubernetes API server is one of the most important things that you can do to secure your cluster. Integrate Kubernetes RBAC with Microsoft Entra ID to control access to the API server.", "type": "recommendation", - "guid": "951727b0-fd7b-4d09-8ae6-384f8441c234" + "guid": "ec5cb7dc-b2fd-4296-bfbf-d2ad9bb69af8" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use Azure network policies or Calico.", - "description": "Secure and control network traffic between pods in a cluster.", + "text": "(Cluster) Use Azure network policies or Calico.", + "description": "By using policies, you can secure and control network traffic between pods in a cluster. 
Calico provides a richer set of capabilities, including policy ordering and priority, deny rules, and more flexible match rules.", "type": "recommendation", - "guid": "68a66542-3c31-43c4-8a16-a05ab2a4df5f" + "guid": "33664205-a916-424e-97af-ed9a7b5b75e0" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Secure clusters and pods with Azure Policy.", - "description": "Azure Policy can help to apply at-scale enforcement and safeguards on your clusters in a centralized, consistent manner. It can also control what functions pods are granted and if anything is running against company policy.", + "text": "(Cluster) Secure clusters and pods by using Azure Policy.", + "description": "Azure Policy can help apply at-scale enforcement and safeguards on your clusters in a centralized, consistent manner. It can also control what functions pods are granted and detect whether anything is running against company policy.", "type": "recommendation", - "guid": "c99d155b-6a13-4f61-8672-356b1ed3a922" + "guid": "9d6ed0e9-fee1-4c25-b5e8-03a4fea4f9bb" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Secure container access to resources.", - "description": "Limit access to actions that containers can perform. Provide the least number of permissions, and avoid the use of root or privileged escalation.", + "text": "(Cluster) Secure container access to resources. Limit access to actions that containers can perform. Provide the least number of permissions, and avoid the use of root or privileged escalation. For Linux-based containers, see Security container access to resources using built-in Linux security features.", + "description": "By restricting permissions and avoiding the use of root or privileged escalation, you help reduce the risk of security breaches. 
You can help ensure that, even if a container is compromised, the potential damage is minimized.", "type": "recommendation", - "guid": "ae6a3199-847d-4901-bad1-63be20d584b8" + "guid": "b0a443c9-24db-4996-b8ad-80f130484c44" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Control cluster egress traffic.", - "description": "Ensure your cluster's outbound traffic is passing through a network security point such as Azure Firewall or an HTTP proxy.", + "text": "(Cluster) Control cluster egress traffic by ensuring that your cluster's outbound traffic passes through a network security point such as Azure Firewall or an HTTP proxy.", + "description": "By routing outbound traffic through Azure Firewall or an HTTP proxy, you can help enforce security policies that prevent unauthorized access and data exfiltration. This approach also simplifies the administration of security policies and makes it easier to enforce consistent rules across your entire AKS cluster.", "type": "recommendation", - "guid": "d67db716-6569-4a40-b18f-01052c02399f" + "guid": "5a170011-093b-4922-b976-b130f425e510" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use the open-source Microsoft Entra Workload ID and Secrets Store CSI Driver with Azure Key Vault.", - "description": "Protect and rotate secrets, certificates, and connection strings in Azure Key Vault with strong encryption. Provides an access audit log, and keeps core secrets out of the deployment pipeline.", + "text": "(Cluster) Use the open-source Microsoft Entra Workload ID and Secrets Store CSI Driver with Key Vault.", + "description": "These features help you protect and rotate secrets, certificates, and connection strings in Key Vault by using strong encryption. 
They provide an access audit log and keep core secrets out of the deployment pipeline.", "type": "recommendation", - "guid": "bc5240f0-9c6d-469e-bf71-9f8d4bf18c29" + "guid": "b50182f9-9d24-4bcb-bae5-967dac38d9b2" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use Microsoft Defender for Containers.", - "description": "Monitor and maintain the security of your clusters, containers, and their applications.", - "type": "recommendation", - "guid": "2ac2d30a-a96a-4d14-b63d-7e2e3147ea9e" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Align SKU selection and managed disk size with workload requirements.", - "description": "Matching your selection to your workload demands ensures you don't pay for unneeded resources.", - "type": "recommendation", - "guid": "93a3ae7a-6f04-471b-bb3f-5eb4e6664c5b" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Select the right virtual machine instance type.", - "description": "Selecting the right virtual machine instance type is critical as it directly impacts the cost of running applications on AKS. Choosing a high-performance instance without proper utilization can lead to wasteful spending, while choosing a less powerful instance can lead to performance issues and increased downtime. 
To determine the right virtual machine instance type, consider workload characteristics, resource requirements, and availability needs.", - "type": "recommendation", - "guid": "f16a2684-c225-4fd5-8202-8ea1db7f18e7" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Select virtual machines based on the Arm architecture.", - "description": "AKS supports creating ARM64 Ubuntu agent nodes, as well as a of mix Intel and ARM architecture nodes within a cluster that can bring better performance at a lower cost.", + "text": "(Cluster) Use Microsoft Defender for Containers.", + "description": "Microsoft Defender for Containers helps you monitor and maintain the security of your clusters, containers, and their applications.", "type": "recommendation", - "guid": "f0572fdc-24e3-4e6c-8c90-0ba85cc6f52a" + "guid": "6746bf44-02b9-4fb5-99af-0b48e46fae88" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Select Azure Spot Virtual Machines.", - "description": "Spot VMs allow you to take advantage of unutilized Azure capacity with significant discounts (up to 90% as compared to pay-as-you-go prices). If Azure needs capacity back, the Azure infrastructure evicts the Spot nodes.", + "text": "(Cluster and workload) Align AKS SKU selection and managed disk size with workload requirements.", + "description": "Matching your selection to your workload demands helps ensure that you don't pay for unneeded resources.", "type": "recommendation", - "guid": "15372689-94ad-4247-a157-fde418217ce4" + "guid": "55b066b5-6563-4172-a95e-d9e8b0fe6b18" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Select the appropriate region.", - "description": "Due to many factors, cost of resources varies per region in Azure. 
Evaluate the cost, latency, and compliance requirements to ensure you are running your workload cost-effectively and it doesn't affect your end-users or create extra networking charges.", + "text": "(Cluster) Choose the right VM instance types for your AKS node pools. To determine the right VM instance types, consider workload characteristics, resource requirements, and availability needs.", + "description": "Selecting the right VM instance type is crucial because it directly affects the cost to run applications on AKS. Choosing a high-performance instance without proper utilization can lead to wasteful spending. Choosing a less powerful instance can lead to performance problems and increased downtime.", "type": "recommendation", - "guid": "41a6e1ef-d63a-4e26-842f-1a0cde3abaa6" + "guid": "8cbe56ba-3474-4be8-b5a3-9b06fab779a1" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Maintain small and optimized images.", - "description": "Streamlining your images helps reduce costs since new nodes need to download these images. Build images in a way that allows the container start as soon as possible to help avoid user request failures or timeouts while the application is starting up, potentially leading to overprovisioning.", + "text": "(Cluster) Choose VMs based on the more power-efficient Arm architecture. AKS supports creating Arm64 node pools and a mix of Intel and Arm architecture nodes within a cluster.", + "description": "The Arm64 architecture provides a better price-to-performance ratio because of its lower power utilization and efficient compute performance. 
These capabilities can bring better performance at a lower cost.", "type": "recommendation", - "guid": "e689c267-1eaa-444d-806e-c5a0735658de" + "guid": "b2e71c48-cbaf-4d99-8c17-a6e35c2ea8b0" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Enable Cluster Autoscaler to automatically reduce the number of agent nodes in response to excess resource capacity.", - "description": "Automatically scaling down the number of nodes in your AKS cluster lets you run an efficient cluster when demand is low and scale up when demand returns.", + "text": "(Cluster) Enable the cluster autoscaler to automatically reduce the number of agent nodes in response to excess resource capacity.", + "description": "Automatically scaling down the number of nodes in your AKS cluster lets you run an efficient cluster when demand is low and scale up when demand increases.", "type": "recommendation", - "guid": "e770ea75-690d-4dc0-a0e4-43c17b6bb56f" + "guid": "7f176926-0efa-4cf6-a47c-bd4ddfc73230" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Enable Node Autoprovision to automate VM SKU selection.", - "description": "Node Autoprovision simplifies the SKU selection process and decides, based on pending pod resource requirements, the optimal VM configuration to run workloads in the most efficient and cost effective manner.", + "text": "(Cluster) Enable node autoprovisioning to automate VM SKU selection.", + "description": "Node autoprovision simplifies the SKU selection process and decides, based on pending pod resource requirements, the optimal VM configuration to run workloads in the most efficient and cost-effective manner.", "type": "recommendation", - "guid": "9994ccff-d3d4-46e3-92cb-d04231859e20" + "guid": "d086a384-e740-484b-ba79-b47b2caf1ac4" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use the Horizontal Pod Autoscaler.", - "description": "Adjust the number 
of pods in a deployment depending on CPU utilization or other select metrics, which support cluster scale-in operations.", + "text": "(Workload) Use HorizontalPodAutoscaler to adjust the number of pods in a deployment depending on CPU utilization or other metrics.", + "description": "Automatically scaling down the number of pods when demand is low and scaling out when demand increases results in a more cost-effective operation of your workload.", "type": "recommendation", - "guid": "1321c176-e53a-4451-91cf-e4e50c637d07" + "guid": "08155215-83f1-4188-8e71-980bd7a9e529" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use Vertical Pod Autoscaler (preview).", - "description": "Rightsize your pods and dynamically set requests and limits based on historic usage.", + "text": "(Workload) Use VerticalPodAutoscaler (preview) to rightsize your pods and dynamically set requests and limits based on historic usage.", + "description": "By setting resource requests and limits on containers for each workload, VerticalPodAutoscaler frees up CPU and memory for other pods and helps ensure effective utilization of your AKS clusters.", "type": "recommendation", - "guid": "554e18fe-b835-4e78-8d27-6f49999855ad" + "guid": "98c8e98e-5f2f-4497-9ff2-da47047fe9ff" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use Kubernetes Event Driven Autoscaling (KEDA).", - "description": "Scale based on the number of events being processed. 
Choose from a rich catalogue of 50+ KEDA scalers.", + "text": "(Cluster) Configure the AKS cost analysis add-on.", + "description": "The cost analysis cluster extension enables you to obtain granular insight into costs that are associated with various Kubernetes resources in your clusters or namespaces.", "type": "recommendation", - "guid": "fe15f362-a48d-4a25-aa3e-7938b3d1f5e2" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Adopt a cloud financial discipline and cultural practice to drive ownership of cloud usage.", - "description": "The foundation of enabling cost optimization is the spread of a cost saving cluster. A financial operations approach (FinOps) is often used to help organizations reduce cloud costs. It is a practice involving collaboration between finance, operations, and engineering teams to drive alignment on cost saving goals and bring transparency to cloud costs.", - "type": "recommendation", - "guid": "d0ed8b85-2072-4952-a00c-697135e435a2" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Sign up for Azure Reservations or Azure Savings Plan.", - "description": "If you properly planned for capacity, your workload is predictable and exists for an extended period of time, sign up for an Azure Reservation or a savings plan to further reduce your resource costs.", - "type": "recommendation", - "guid": "9d4b8d27-c793-4862-94c8-c2505205f07f" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Configure the AKS Cost Analysis add-on.", - "description": "The cost analysis cluster extension enables you to obtain granular insight into costs associated with various Kubernetes resources in your clusters or namespaces.", - "type": "recommendation", - "guid": "43cae555-15bd-4cd7-90b9-905d40528506" + "guid": "197b8613-a0ff-4f70-9e50-e7a866e312f3" }, { "waf": "Operations", "service": "Azure Kubernetes 
Service", - "text": "Cluster and workload architectures: Review AKS best practices documentation.", - "description": "To build and run applications successfully in AKS, there are key considerations to understand and implement. These areas include multi-tenancy and scheduler features, cluster, and pod security, or business continuity and disaster recovery.", + "text": "(Cluster) Operationalize cluster and pod configuration standards by using Azure policies for AKS.", + "description": "Azure policies for AKS can help you apply at-scale enforcement and safeguards on your clusters in a centralized, consistent manner. Use policies to define the permissions granted to pods and ensure compliance with company policies.", "type": "recommendation", - "guid": "4ea51e35-26bb-4a2b-a7ca-435eb6d857b9" + "guid": "e96cf35a-d410-49da-80b6-e63b205cefe8" }, { "waf": "Operations", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Review Azure Chaos Studio.", - "description": "Azure Chaos Studio can help simulate faults and trigger disaster recovery situations.", + "text": "(Workload) Use Kubernetes Event Driven Autoscaler (KEDA).", + "description": "KEDA allows your applications to scale based on events, like the number of events being processed. 
You can choose from a rich catalog of more than 50 KEDA scalers.", "type": "recommendation", - "guid": "d38b720a-0b27-409f-9dfa-28f698067a19" - }, - { - "waf": "Operations", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Adopt a multiregion strategy by deploying AKS clusters deployed across different Azure regions to maximize availability and provide business continuity.", - "description": "Internet facing workloads should leverage Azure Front Door or Azure Traffic Manager to route traffic globally across AKS clusters.", - "type": "recommendation", - "guid": "72c0719b-444b-49fa-b3fb-598de4c4a99d" - }, - { - "waf": "Operations", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Operationalize clusters and pods configuration standards with Azure Policy.", - "description": "Azure Policy can help to apply at-scale enforcement and safeguards on your clusters in a centralized, consistent manner. It can also control what functions pods are granted and if anything is running against company policy.", - "type": "recommendation", - "guid": "4fa55d17-b08b-4885-8d5b-d8cfbe3eda59" - }, - { - "waf": "Operations", - "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use platform capabilities in your release engineering process.", - "description": "Kubernetes and ingress controllers support many advanced deployment patterns for inclusion in your release engineering process. 
Consider patterns like blue-green deployments or canary releases.", - "type": "recommendation", - "guid": "40bf5a02-ca6a-410d-9b6a-87fe75ff0873" - }, - { - "waf": "Operations", - "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: For mission-critical workloads, use stamp-level blue/green deployments.", - "description": "Automate your mission-critical design areas, including deployment and testing.", - "type": "recommendation", - "guid": "dec76a46-6005-4c3e-ac2c-70b7214a28ba" + "guid": "09f7f5a9-2dfa-4924-8686-219c2c33c95e" }, { "waf": "Performance", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Develop a detailed capacity plan and continually review and revise.", - "description": "After formalizing your capacity plan, it should be frequently updated by continuously observing the resource utilization of the cluster.", + "text": "(Cluster) Enable cluster autoscaler to automatically adjust the number of agent nodes in response to workload demands. Use the HorizontalPodAutoscaler to adjust the number of pods in a deployment depending on CPU utilization or other metrics.", + "description": "The ability to automatically scale up or scale down the number of nodes and the number of pods in your AKS cluster lets you run an efficient, cost-effective cluster.", "type": "recommendation", - "guid": "9f2cef22-f674-4702-a57a-5dcf90bf9143" + "guid": "c5ac26e5-e72c-4222-9f78-844ab2723c7a" }, { "waf": "Performance", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Enable cluster autoscaler to automatically adjust the number of agent nodes in response to resource constraints.", - "description": "The ability to automatically scale up or down the number of nodes in your AKS cluster lets you run an efficient, cost-effective cluster.", + "text": "(Cluster and workload) Separate workloads into different node pools and consider scaling user node pools.", + "description": "Unlike system node 
pools that always require running nodes, user node pools allow you to scale up or scale down.", "type": "recommendation", - "guid": "d50d653b-0d8d-44ca-8ba6-9963321d1f24" + "guid": "e73fb529-0f8f-40cc-9724-5601f23110af" }, { "waf": "Performance", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Separate workloads into different node pools and consider scaling user node pools.", - "description": "Unlike System node pools that always require running nodes, user node pools allow you to scale up or down.", + "text": "(Workload) Use AKS advanced scheduler features to implement advanced balancing of resources for workloads that require them.", + "description": "As you manage AKS clusters, you often need to isolate teams and workloads. Advanced features that the Kubernetes scheduler provides let you control which pods can be scheduled on certain nodes. They also let you control how multipod applications can be appropriately distributed across the cluster.", "type": "recommendation", - "guid": "cd14e737-9478-4fb4-9556-586b03d5e248" + "guid": "04302838-ade8-4ed5-afbd-abca958fc1f7" }, { "waf": "Performance", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use AKS advanced scheduler features.", - "description": "Helps control balancing of resources for workloads that require them.", + "text": "(Workload) Use KEDA to build a meaningful autoscale ruleset based on signals that are specific to your workload.", + "description": "Not all scale decisions can be derived from CPU or memory metrics. Scale considerations often come from more complex or even external data points. 
KEDA allows your applications to scale based on events, such as the number of messages in a queue or the length of a topic lag.", "type": "recommendation", - "guid": "e452f566-2a10-4119-a802-aeab44201971" - }, - { - "waf": "Performance", - "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use meaningful workload scaling metrics.", - "description": "Not all scale decisions can be derived from CPU or memory metrics. Often scale considerations will come from more complex or even external data points. Use KEDA to build a meaningful auto scale ruleset based on signals that are specific to your workload.", - "type": "recommendation", - "guid": "43df6223-dfca-447d-9b00-f4742ef18be1" + "guid": "ddc26835-9557-4bcb-9d55-b19df3db78db" } ], "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -437,6 +349,6 @@ "name": "Azure Kubernetes Service Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/azuremachinelearning_sg_checklist.en.json b/checklists-ext/azuremachinelearning_sg_checklist.en.json index d42e8c4b..3dc6f9c9 100644 --- a/checklists-ext/azuremachinelearning_sg_checklist.en.json +++ b/checklists-ext/azuremachinelearning_sg_checklist.en.json @@ -269,34 +269,34 @@ "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + 
"name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -333,6 +333,6 @@ "name": "Azure Machine Learning Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/azureopenai_sg_checklist.en.json b/checklists-ext/azureopenai_sg_checklist.en.json index e2b1032d..ea4e1c35 100644 --- a/checklists-ext/azureopenai_sg_checklist.en.json +++ b/checklists-ext/azureopenai_sg_checklist.en.json @@ -109,34 +109,34 @@ "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -173,6 +173,6 @@ "name": "Azure Openai Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/virtualmachines_sg_checklist.en.json b/checklists-ext/virtualmachines_sg_checklist.en.json index 4b3cd509..d926bb69 100644 --- a/checklists-ext/virtualmachines_sg_checklist.en.json +++ b/checklists-ext/virtualmachines_sg_checklist.en.json @@ -13,7 +13,7 @@ "waf": "Reliability", "service": "Virtual Machines", "text": "(VMs) Implement heath endpoints that emit instance health statuses on VMs. 
(Scale set) Enable automatic repairs on the scale set by specifying the preferred repair action. Consider setting a time frame during which automatic repairs pause if the VM's state changes.", - "description": "Maintain availability even if an instance is deemed unhealthy. Automatic repairs initiate recovery by replacing the faulty instance. Setting a time window can prevent inadvertent or premature repair operations.", + "description": "Maintain availability even if an instance is deemed unhealthy. Automatic repairs initiate recovery by replacing the faulty instance. Setting a time window can prevent inadvertent or premature repair operations.", "type": "recommendation", "guid": "fac87b55-eeaa-47d4-99fc-05bf5e220e3e" }, @@ -25,6 +25,14 @@ "type": "recommendation", "guid": "6f9864f6-415a-40eb-be9d-328d66a1a313" }, + { + "waf": "Reliability", + "service": "Virtual Machines", + "text": "(Scale set) Preallocate instances with standby pools.", + "description": "Standby pool instances remain dormant but are ready to take over workloads if a failure occurs. This capability enhances the system's reliability.", + "type": "recommendation", + "guid": "1a8d2764-0196-440d-88b6-07dd7dfd99fe" + }, { "waf": "Reliability", "service": "Virtual Machines", @@ -36,10 +44,18 @@ { "waf": "Reliability", "service": "Virtual Machines", - "text": "(Scale set) Deploy across availability zones on scale sets. Set up at least two instances in each zone. Zone balancing equally spreads the instances across zones.", + "text": "(Scale set) Deploy across availability zones on scale sets. Set up at least two instances in each zone. Zone balancing equally spreads the instances across zones.", "description": "The VM instances are provisioned in physically separate locations within each Azure region that are tolerant to local failures. Keep in mind that, depending on resource availability, there might be an uneven number of instances across zones. 
Zone balancing supports availability by making sure that, if one zone is down, the other zones have sufficient instances. Two instances in each zone provide a buffer during upgrades.", "type": "recommendation", - "guid": "1602d5e4-e1cb-4e3a-b74e-1636b8e789cd" + "guid": "498c8975-065f-40b0-986e-2a889c7a462d" + }, + { + "waf": "Reliability", + "service": "Virtual Machines", + "text": "(Scale set) To enhance service uptime while maintaining control over the cost implications of upgrades, enable MaxSurge.", + "description": "New instances are created in batches by using the latest scale model. After the new instances are healthy, the old instances are deleted in batches. This process continues until all instances are updated, which ensures no downtime during updates.", + "type": "recommendation", + "guid": "4dee4138-6fc5-443b-9231-be3bf0019130" }, { "waf": "Reliability", @@ -60,10 +76,10 @@ { "waf": "Security", "service": "Virtual Machines", - "text": "(Scale set) Choose VM SKUs with security features. For example, some SKUs support BitLocker encryption, and confidential computing provides encryption of data-in-use. Review the features to understand the limitations.", + "text": "(Scale set) Choose VM SKUs that have security features. For example, some SKUs support BitLocker encryption, and confidential computing provides encryption of data-in-use. Review the features to understand the limitations.", "description": "Azure-provided features are based on signals that are captured across many tenants and can protect resources better than custom controls. You can also use policies to enforce those controls.", "type": "recommendation", - "guid": "f82d63a6-accd-4021-8ff3-4774c4c4510d" + "guid": "fa4eedc6-d140-4879-b3cf-00304ab23324" }, { "waf": "Security", @@ -85,7 +101,7 @@ "waf": "Security", "service": "Virtual Machines", "text": "(VMs) Choose secure networking options for your VM's network profile. 
Don't directly associate public IP addresses to your VMs and don't enable IP forwarding. Ensure that all virtual network interfaces have an associated network security group.", - "description": "You can set segmentation controls in the networking profile. Attackers scan public IP addresses, which makes VMs vulnerable to threats.", + "description": "You can set segmentation controls in the networking profile. Attackers scan public IP addresses. This activity makes VMs vulnerable to threats.", "type": "recommendation", "guid": "15f0aff4-216c-4900-af7d-a5e43796590b" }, @@ -116,26 +132,34 @@ { "waf": "Cost", "service": "Virtual Machines", - "text": "(VMs, scale set) Evaluate the disk options that are associated with your VM's SKUs. Determine your performance needs while keeping in mind your storage capacity needs and accounting for fluctuating workload patterns. For example, the Azure Premium SSD v2 disk allows you to granularly adjust your performance independent of the disk's size.", - "description": "Some high-performance disk types offer extra cost optimization features and strategies. The Premium SSD v2 disk's adjustment capability can reduce costs because it provides high performance without overprovisioning, which could otherwise lead to underutilized resources.", + "text": "(Scale set) Mix regular VMs with spot virtual machines. Flexible orchestration lets you distribute spot virtual machines based on a specified percentage.", + "description": "Reduce compute infrastructure costs by applying the deep discounts of spot virtual machines.", "type": "recommendation", - "guid": "184fb2f7-0386-4a88-8487-95623a412bec" + "guid": "b1fdfb60-31c8-4ab9-8106-f62e863f6d31" }, { "waf": "Cost", "service": "Virtual Machines", - "text": "(Scale set) Mix regular VMs with spot virtual machines. 
Flexible orchestration lets you distribute spot virtual machines based on a specified percentage.", - "description": "Reduce compute infrastructure costs by applying the deep discounts of spot virtual machines.", + "text": "(Scale set) Reduce the number of VM instances when demand decreases. Set a scale-in policy based on criteria.", + "description": "Scaling in resources when they're not in use reduces the number of VMs that run in the scale set, which saves costs.", "type": "recommendation", - "guid": "b1fdfb60-31c8-4ab9-8106-f62e863f6d31" + "guid": "07c39210-9e39-46ee-b382-0f774e79551c" }, { "waf": "Cost", "service": "Virtual Machines", - "text": "(Scale set) Reduce the number of VM instances when demand decreases. Set a scale-in policy based on criteria. Stop VMs during off-hours. You can use the Azure Automation Start/Stop feature and configure it according to your business needs.", - "description": "Scaling in or stopping resources when they're not in use reduces the number of VMs running in the scale set, which saves costs. The Start/Stop feature is a low-cost automation option.", + "text": "(VMs) Stop VMs during off-hours. You can use the Azure Automation Start/Stop feature and configure it according to your business needs.", + "description": "The Start/Stop feature is a low-cost automation option that can significantly affect your idle instance costs.", "type": "recommendation", - "guid": "99a9ab5e-e84d-493d-854b-40eca32360ce" + "guid": "39800efc-c1ea-4447-9a6d-601404e6127f" + }, + { + "waf": "Cost", + "service": "Virtual Machines", + "text": "(VMs) Free up CPU resources by using Azure Boost.", + "description": "Offloading back-end virtualization processes frees up CPU resources for the guest virtual machines. This optimization results in improved performance. 
Azure Boost is only available on specific VMs, so ensure that you also choose VM sizes that have Azure Boost enabled.", + "type": "recommendation", + "guid": "b4a253ee-ed18-4ded-bbd2-4ef32bbb2837" }, { "waf": "Cost", @@ -157,7 +181,7 @@ "waf": "Operations", "service": "Virtual Machines", "text": "(Scale set) Keep your VMs up to date by setting an upgrade policy. We recommend rolling upgrades. However, if you need granular control, choose to upgrade manually. For Flexible orchestration, you can use Azure Update Manager.", - "description": "Security is the primary reason for upgrades. Security assurances for the instances shouldn't decay over time. Rolling upgrades are done in batches, which ensures all instances aren't down at the same time.", + "description": "Security is the primary reason for upgrades. Security assurances for the instances shouldn't decay over time. Rolling upgrades are done in batches. This approach ensures that all instances aren't down at the same time.", "type": "recommendation", "guid": "62e3e643-0661-4c0d-aa35-9066eb12e56e" }, @@ -172,26 +196,26 @@ { "waf": "Operations", "service": "Virtual Machines", - "text": "Install prebuilt software components as extensions as part of bootstrapping. Azure supports many extensions that can be used to configure, monitor, secure, and provide utility applications for your VMs. Enable automatic upgrades on extensions.", + "text": "Install prebuilt software components as extensions as part of bootstrapping. Azure supports many extensions that can be used to configure, monitor, secure, and provide utility applications for your VMs. 
Enable automatic upgrades on extensions.", "description": "Extensions can help simplify the software installation at scale without you having to manually install, configure, or upgrade it on each VM.", "type": "recommendation", - "guid": "7177b0fd-461c-49e1-9183-f1ee9986c5b6" + "guid": "581d30c0-dff3-48a4-96d3-73b6cebeca2e" }, { "waf": "Operations", "service": "Virtual Machines", - "text": "(VMs, scale set) Monitor and measure the health of the VM instances. Deploy the Monitor agent extension to your VMs to collect monitoring data from the guest OS with OS-specific data collection rules. Enable VM insights to monitor health and performance and to view trends from the collected data. Use boot diagnostics to get information as VMs boot. Boot diagnostics also diagnose boot failures.", - "description": "Monitoring data is at the core of incident resolution. A comprehensive monitoring stack provides information about how the VMs are performing and their health. By continuously monitoring the instances, you can be ready for or prevent failures like performance overload and reliability issues.", + "text": "(VMs, scale set) Monitor and measure the health of the VM instances. Deploy the Monitor agent extension to your VMs to collect monitoring data from the guest OS with OS-specific data collection rules. Enable VM insights to monitor health and performance and to view trends from the collected data. Use boot diagnostics to get information as VMs boot. Boot diagnostics also diagnose boot failures.", + "description": "Monitoring data is at the core of incident resolution. A comprehensive monitoring stack provides information about how the VMs are performing and their health. 
By continuously monitoring the instances, you can be ready for or prevent failures like performance overload and reliability problems.", "type": "recommendation", - "guid": "4d2f720c-f0d3-4d39-9258-b212ca1c8a99" + "guid": "9e2e967b-87ee-4358-8434-ac9ae26e721d" }, { "waf": "Performance", "service": "Virtual Machines", - "text": "(VMs, scale set) Choose SKUs for VMs that align with your capacity planning. Have a good understanding of your workload requirements, including the number of cores, memory, storage, and network bandwidth so that you can filter out unsuitable SKUs.", - "description": "Rightsizing your VMs is a fundamental decision that significantly affects the performance of your workload. Without the right set of VMs, you might experience performance issues and accrue unnecessary costs.", + "text": "(VMs, scale set) Choose SKUs for VMs that align with your capacity planning. Have a good understanding of your workload requirements, including the number of cores, memory, storage, and network bandwidth so that you can filter out unsuitable SKUs.", + "description": "Rightsizing your VMs is a fundamental decision that significantly affects the performance of your workload. Without the right set of VMs, you might experience performance problems and accrue unnecessary costs.", "type": "recommendation", - "guid": "e6e6f3f5-7632-4f17-975e-7b35d09b38b4" + "guid": "f5e2057e-e0e8-494d-8e18-b03477f7784c" }, { "waf": "Performance", @@ -201,14 +225,6 @@ "type": "recommendation", "guid": "07d79869-9589-4d6a-9e5b-cb0f27cf4b48" }, - { - "waf": "Performance", - "service": "Virtual Machines", - "text": "(VMs, scale set) Set the storage profile by analyzing the disk performance of existing workloads and the VM SKU. Use Premium SSDs for production VMs. Adjust the performance of disks with Premium SSD v2. Use locally attached NVMe devices.", - "description": "Premium SSDs deliver high-performance and low-latency disk support VMs with I/O-intensive workloads. 
Premium SSD v2 doesn't require disk resizing, which enables high performance without excessive over-provisioning and minimizes the cost of unused capacity. When available on VM SKUs, locally attached NVMe or similar devices can offer high performance, especially for use cases that require high input/output operations per second (IOPS) and low latency.", - "type": "recommendation", - "guid": "e0902a54-a13c-4831-8dbe-c013010044f4" - }, { "waf": "Performance", "service": "Virtual Machines", @@ -229,34 +245,34 @@ "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -293,6 +309,6 @@ "name": "Virtual Machines Service Guide", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at end of file diff --git a/checklists-ext/wafsg_checklist.en.json b/checklists-ext/wafsg_checklist.en.json index f08136c8..ae551349 100644 --- a/checklists-ext/wafsg_checklist.en.json +++ b/checklists-ext/wafsg_checklist.en.json @@ -4,10 +4,10 @@ { "waf": "reliability", "service": "App Service Web Apps", - "text": "Prioritize user flows: Not all flows are equally critical. Assign priorities to each flow to guide your design decisions. User flow design can influence which service tiers and number of instances that you choose for an App Service plan and configuration.", + "text": "Prioritize user flows: Not all flows are equally important. Identify the critical paths in your application and assign priorities to each flow to guide your design decisions. 
User flow design can influence which service tiers and the number of instances that you choose for an App Service plan and configuration.", "description": "", "type": "checklist", - "guid": "a549f1f2-55ec-4094-b8c5-1be6fd4f4d38" + "guid": "8e49fd91-ee1f-46ba-b1aa-ee68f0c0123a" }, { "waf": "reliability", @@ -20,18 +20,34 @@ { "waf": "reliability", "service": "App Service Web Apps", - "text": "Build redundancy: Build redundancy in the application and supporting infrastructure. Spread instances across availability zones to improve fault tolerance. Traffic is routed to other zones if one zone fails. Deploy your application across multiple regions to ensure that your app remains available, even if an entire region experiences an outage.", + "text": "Build redundancy: Build redundancy in the application and supporting infrastructure. Spread instances across availability zones to improve fault tolerance. Traffic is routed to other zones if one zone fails. Deploy your application across multiple regions to help ensure that your app remains available, even if an entire region experiences an outage.", "description": "", "type": "checklist", - "guid": "b47d664a-75c8-4739-80d8-6cf9f60abb56" + "guid": "24eccd93-64f9-4095-bfd0-3411e7ed709e" }, { "waf": "reliability", "service": "App Service Web Apps", - "text": "Have a reliable scaling strategy: Unexpected load on an application can make it unreliable. Consider the right scaling approach based on your workload characteristics. You can sometimes scale up to handle the load. However, if the load continues to increase, scale out to new instances. Prefer automatic scaling over manual approaches. Always maintain a buffer of extra capacity during scaling operations to prevent performance degradation.", + "text": "Use multiple instances: An immediate single-point-of-failure occurs if you run your app on only one instance. Allocate multiple instances to your app to help ensure high availability. 
If one instance fails, other instances can still handle incoming requests.\u202fYour app code should be able to handle multiple instances\u202fwithout synchronization issues when reading from data sources or writing to data sources.", "description": "", "type": "checklist", - "guid": "6b8fa1ed-cfa0-4435-9146-b3d1a8edb432" + "guid": "cbaa325e-ee6d-4a1d-8efa-5f7084f238d0" + }, + { + "waf": "reliability", + "service": "App Service Web Apps", + "text": "Have a reliable scaling strategy: Unexpected load on an application can make it unreliable. Consider the right scaling approach based on your workload characteristics. Horizontal scaling, or scaling out, allows you to add more instances to distribute the load. Vertical scaling, or scaling up, increases the capacity of an existing instance, such as CPU or memory. Be cautious of over-provisioning because adding unnecessary instances increases costs without tangible performance benefits.", + "description": "", + "type": "checklist", + "guid": "56efb512-0004-4167-80d2-6ff63bd746b1" + }, + { + "waf": "reliability", + "service": "App Service Web Apps", + "text": "Ensure proper app initialization so that new instances warm up quickly and can receive requests. Strive for stateless applications when possible. Reliably scaling state with new instances can increase complexity. Consider an external data store that you can scale independently if you need to store application state. Storing session state in memory can result in losing session state when there's a problem with the application or App Service. 
It also limits the possibility of spreading the load across other instances.", + "description": "", + "type": "checklist", + "guid": "fd63ca51-7498-4bfa-9dbc-a91a3dfb3f3b" }, { "waf": "reliability", @@ -52,50 +68,66 @@ { "waf": "reliability", "service": "App Service Web Apps", - "text": "Use health probes to identify unresponsive workers: App Service has built-in capabilities that periodically ping a specific path of your web application. Unresponsive instances are removed from the load balancer and replaced with a new instance.", + "text": "Use the health check feature to identify unresponsive workers: App Service has built-in capabilities that periodically ping a specific path of your web application. The platform pings this path to determine whether your application is healthy and responds to requests.", + "description": "", + "type": "checklist", + "guid": "e7583a41-f0ef-44e7-bd1b-a87b658faa35" + }, + { + "waf": "reliability", + "service": "App Service Web Apps", + "text": "Use the auto-heal feature: Sometimes your application might experience unexpected behaviors that a simple restart can solve. Use the auto-heal feature to define a condition that triggers auto-heal and the action that auto-heal initiates when that condition is met. For more information, see App Service diagnostics overview.", "description": "", "type": "checklist", - "guid": "dcceb232-19d6-48dc-a91f-5b4907cc3dda" + "guid": "2c212d74-0dc6-4172-9da4-a10da16a6057" + }, + { + "waf": "reliability", + "service": "App Service Web Apps", + "text": "App Service resiliency score report: To review tailored best practice recommendations, see the App Service diagnostics overview.", + "description": "", + "type": "checklist", + "guid": "4a0c48ec-c629-4f9c-b366-7e441e239f34" }, { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service plan) Choose the Premium tier of an App Service plan for production workloads. 
Set the maximum and minimum number of workers according to your capacity planning. For more information, see App Service plan overview.", - "description": "A premium App Service plan offers advanced scaling features and ensures redundancy if failures occur.", + "text": "(App Service) Choose the Premium v3 tier of an App Service plan for production workloads. Set the maximum and minimum number of workers according to your capacity planning. For more information, see App Service plan overview.", + "description": "A Premium v3 App Service plan provides advanced scaling features and ensures redundancy if failures occur.", "type": "recommendation", - "guid": "696fd187-e2a8-45e8-bbe7-a6d3cb1fca62" + "guid": "cbedd1e8-d809-4059-ae37-a33419f31016" }, { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service plan) Enable zone redundancy. Consider provisioning more than three instances to enhance fault tolerance. Check regional support for zone redundancy because not all regions offer this feature.", + "text": "(App Service) Enable zone redundancy. Consider provisioning more than three instances to enhance fault tolerance. Check regional support for zone redundancy because not all regions have this feature.", "description": "Your application can withstand failures in a single zone when multiple instances are spread across zones. Traffic automatically shifts to healthy instances in other zones and maintains application reliability if one zone is unavailable.", "type": "recommendation", - "guid": "830faa5e-7767-4526-8550-547a345dd398" + "guid": "8313cf7d-c790-4b71-9821-f37e4ad92b81" }, { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service) Consider disabling the application request routing (ARR) affinity feature. 
ARR affinity creates sticky sessions that redirect users to the node that handled their previous requests.", - "description": "Incoming requests are evenly distributed across all available nodes when you disable ARR affinity. Evenly distributed requests prevent traffic from overwhelming any single node. Requests can be seamlessly redirected to other healthy nodes if a node is unavailable. Avoid session affinity to ensure that your App Service instance remains stateless. A stateless App Service reduces complexity and ensures consistent behavior across nodes. Remove sticky sessions so that App Service can add or remove instances to scale horizontally.", + "text": "(Web Apps) Consider disabling the application request routing (ARR) affinity feature. ARR affinity creates sticky sessions that redirect users to the node that handled their previous requests.", + "description": "Incoming requests are evenly distributed across all available nodes when you disable ARR affinity. Evenly distributed requests prevent traffic from overwhelming any single node. Requests can be seamlessly redirected to other healthy nodes if a node is unavailable. Avoid session affinity to ensure that your App Service instance remains stateless. A stateless App Service instance reduces complexity and ensures consistent behavior across nodes. Remove sticky sessions so that App Service can add or remove instances to scale horizontally.", "type": "recommendation", - "guid": "3551bbd4-1f44-4bca-b25c-53e16f79cafe" + "guid": "96405930-bc46-4741-ac5e-2d3446c1918c" }, { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service) Define automatic healing rules based on request count, slow requests, memory limits, and other indicators that are part of your performance baseline. 
Consider this configuration as part of your scaling strategy.", + "text": "(Web Apps) Define automatic healing rules based on request count, slow requests, memory limits, and other indicators that are part of your performance baseline. Consider this configuration as part of your scaling strategy.", "description": "Automatic healing rules help your application recover automatically from unexpected problems. The configured rules trigger healing actions when thresholds are breached. Automatic healing enables automatic proactive maintenance.", "type": "recommendation", - "guid": "2a428ce1-dc25-4a1b-baf4-5edc909369fe" + "guid": "8492f140-120a-49f8-94c7-f56a4b5970bc" }, { "waf": "Reliability", "service": "App Service Web Apps", - "text": "(App Service) Enable the health check feature and provide a path that responds to the health check requests.", + "text": "(Web Apps) Enable the health check feature and provide a path that responds to the health check requests.", "description": "Health checks can detect problems early. Then the system can automatically take corrective actions when a health check request fails. The load balancer routes traffic away from unhealthy instances, which directs users to healthy nodes.", "type": "recommendation", - "guid": "4dd04acd-7d69-45b8-aa22-223c4ecc0a8c" + "guid": "c432b38c-4ddb-476d-96ca-eea039bd2d8b" }, { "waf": "security", @@ -108,50 +140,74 @@ { "waf": "security", "service": "App Service Web Apps", - "text": "Use the latest runtime and libraries: Thoroughly test your application builds before you do updates to catch problems early and ensure a smooth transition to the new version. App Service supports the language runtime support policy for updating existing stacks and retiring end-of-support stacks.", + "text": "Use the latest runtime and libraries: Thoroughly test your application builds before you do updates to catch problems early and help ensure a smooth transition to the new version. 
App Service supports the language runtime support policy for updating existing stacks and retiring end-of-support stacks.", + "description": "", + "type": "checklist", + "guid": "4408a541-7656-46a5-a944-d92779c59590" + }, + { + "waf": "security", + "service": "App Service Web Apps", + "text": "Create segmentation through isolation boundaries to contain breaches: Apply identity segmentation. For example, implement role-based access control (RBAC) to assign specific permissions based on roles. Follow the principle of least privilege to limit access rights to only what's necessary. Also create segmentation at the network level. Integrate App Service apps with an Azure virtual network for isolation and define network security groups (NSGs) to filter traffic.", + "description": "", + "type": "checklist", + "guid": "0411a3f1-208f-4571-a881-866948e78832" + }, + { + "waf": "security", + "service": "App Service Web Apps", + "text": "Apply access controls on identities: Restrict inward access to the web app and outward access from the web app to other resources. This configuration applies access controls on identities and helps maintain the workload's overall security posture.", + "description": "", + "type": "checklist", + "guid": "35490d6e-035c-40e0-b2ec-9015d439a8a2" + }, + { + "waf": "security", + "service": "App Service Web Apps", + "text": "Apply network security controls: Integrate your App Service with a virtual network to control the outbound traffic. Use private endpoints to control inbound traffic, limit access to your App Service instance from within your virtual network, and disable public internet access. For more information, see Network routing.", "description": "", "type": "checklist", - "guid": "cedf41d1-6f8a-4898-9cbd-064e66931bfa" + "guid": "1abe0979-3a8a-4488-9359-c17a65a9055d" }, { "waf": "security", "service": "App Service Web Apps", - "text": "Create segmentation through isolation boundaries to contain breach: Apply identity segmentation. 
For example, implement role-based access control (RBAC) to assign specific permissions based on roles. Follow the principle of least privilege to limit access rights to only what's necessary. Also create segmentation at the network level. Inject App Service apps in an Azure virtual network for isolation and define network security groups (NSGs) to filter traffic.", + "text": "Encrypt data: Help protect data in transit by using end-to-end Transport Layer Security (TLS). Use your customer-managed keys for full encryption of data at rest.", "description": "", "type": "checklist", - "guid": "469151fd-f51c-43ab-84f7-5d6c01e99fba" + "guid": "51d8015d-7bc4-4cae-b385-fcb63ec87bc3" }, { "waf": "security", "service": "App Service Web Apps", - "text": "Apply access controls on identities: Restrict both inward access to the web app and outward access from the web app to other resources. This configuration applies access controls on identities and helps maintain the workload's overall security posture.", + "text": "End-to-end TLS encryption: End-to-end TLS encryption is available in premium App Service plans. This feature encrypts your traffic throughout the entire transaction, which minimizes the risk of traffic interception.", "description": "", "type": "checklist", - "guid": "794e1680-facc-4258-89cb-a07f1fa6e4d0" + "guid": "fbeb9cf9-3f31-48fc-87e1-3d47f580529b" }, { "waf": "security", "service": "App Service Web Apps", - "text": "Control network traffic to and from the application: Don't expose application endpoints to the public internet. Instead, add a private endpoint on the web app that's placed in a dedicated subnet. Front your application with a reverse proxy that communicates with that private endpoint. Consider using Application Gateway or Azure Front Door for that purpose.", + "text": "Reduce the attack surface: Remove default configurations that you don't need. 
For example, disable remote debugging, local authentication for Source Control Manager (SCM) sites, and basic authentication. Disable unsecured protocols like HTTP and File Transfer Protocol (FTP). Enforce configurations by using Azure policies. For more information, see Azure policies.", "description": "", "type": "checklist", - "guid": "0ff3e194-df50-4856-b798-7487e5fc4d34" + "guid": "d5f8bc9d-86d7-44f8-ae5e-bebebf7efae6" }, { "waf": "security", "service": "App Service Web Apps", - "text": "Encrypt data: Protect data in transit with end-to-end Transport Layer Security (TLS). Use your customer-managed keys for full encryption of data at rest. For more information, see Encryption at rest using customer-managed keys.", + "text": "Implement restrictive cross-origin resource sharing (CORS) policies: Use restrictive CORS policies in your web app to only accept requests from the allowed domains, headers, and other criteria. Enforce CORS policies by using built-in Azure policy definitions.", "description": "", "type": "checklist", - "guid": "a6fcbd07-0da4-4386-902d-6708c5a5cc23" + "guid": "f1903d27-6b11-4be8-b210-31f71abd139f" }, { "waf": "security", "service": "App Service Web Apps", - "text": "Reduce the attack surface: Remove default configurations that you don't need. For example, disable remote debugging, local authentication for Source Control Manager (SCM) sites, and basic authentication. Disable unsecure protocols like HTTP and File Transfer Protocol (FTP). Enforce configurations through Azure policies. 
For more information, see Azure policies.", + "text": "Use managed identities: Enable managed identities for your App Service instance to more securely access other Azure services without needing to manage credentials.", "description": "", "type": "checklist", - "guid": "410bf979-2e37-4dfc-82ba-33dc8e9fbbf0" + "guid": "4d77b8e3-16ba-46e2-b509-57a1d7909e91" }, { "waf": "security", @@ -164,82 +220,82 @@ { "waf": "security", "service": "App Service Web Apps", - "text": "Enable resource logs for your application: Enable resource logs for your application to create comprehensive activity trails that provide valuable data during investigations that follow security incidents.", + "text": "Enable resource logs for your application: Enable resource logs for your application to create comprehensive activity trails that provide valuable data during investigations that follow security incidents. For more information, see Azure Monitor resource logs.", "description": "", "type": "checklist", - "guid": "ae9e2e30-a31e-4459-ae31-fbcb8307016e" + "guid": "0fb99e4f-ae69-47fc-8157-c4f39d6a162d" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) Assign managed identities to the web app. To maintain isolation boundaries, don't share or reuse identities across applications. Make sure that you securely connect to your container registry if you use containers for your deployment.", + "text": "(Web Apps) Assign managed identities to the web app. To maintain isolation boundaries, don't share or reuse identities across applications. Make sure that you securely connect to your container registry if you use containers for your deployment.", "description": "The application retrieves secrets from Key Vault to authenticate outward communication from the application. Azure manages the identity and doesn't require you to provision or rotate any secrets. You have distinct identities for granularity of control. 
Distinct identities make revocation easy if an identity is compromised.", "type": "recommendation", - "guid": "8f1a9e86-9309-4e41-bde5-0828c15daebf" + "guid": "5beee06c-df3f-4b75-982c-9d2cfc1f486b" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) Configure custom domains for applications. Disable HTTP and only accept HTTPS requests.", - "description": "Custom domains enable secure communication through HTTPS using Transport Layer Security (TLS) protocol, which ensures the protection of sensitive data and builds user trust.", + "text": "(Web Apps) Configure custom domains for applications. Disable HTTP and only accept HTTPS requests.", + "description": "Custom domains enable secure communication through HTTPS by using TLS protocol, which helps ensure the protection of sensitive data and builds user trust.", "type": "recommendation", - "guid": "cb4271e8-40e1-4cd3-ad1e-5d4584c485d7" + "guid": "b4dd4550-dd08-4bec-a885-9d6d8597d93e" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) valuate whether App Service built-in authentication is the right mechanism to authenticate users that access your application. App Service built-in authentication integrates with Microsoft Entra ID. This feature handles token validation and user identity management across multiple sign-in providers and supports OpenID Connect. With this feature, you don't have authorization at a granular level, and you don't have a mechanism to test authentication.", + "text": "(Web Apps) Evaluate whether App Service built-in authentication is the right mechanism to authenticate users that access your application. App Service built-in authentication integrates with Microsoft Entra ID. This feature handles token validation and user identity management across multiple sign-in providers and supports OpenID Connect. 
With this feature, you don't have authorization at a granular level, and you don't have a mechanism to test authentication.", "description": "When you use this feature, you don't have to use authentication libraries in application code, which reduces complexity. The user is already authenticated when a request reaches the application.", "type": "recommendation", - "guid": "0e19e111-ccad-457f-94b5-ee2deebc553c" + "guid": "537afd69-9d35-44bc-9e50-0a79cee5229d" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) Configure the application for virtual network integration. Use private endpoints for App Service apps. Block all public traffic. Route the container image pull through the virtual network integration. All outgoing traffic from the application passes through the virtual network.", + "text": "(Web Apps) Configure the application for virtual network integration. Use private endpoints for App Service apps. Block all public traffic. Route the container image pull through the virtual network integration. All outgoing traffic from the application passes through the virtual network.", "description": "Get the security benefits of using an Azure virtual network. For example, the application can securely access resources within the network. Add a private endpoint to help protect your application. Private endpoints limit direct exposure to the public network and allow controlled access through the reverse proxy.", "type": "recommendation", - "guid": "1835b31e-37b4-431a-b783-eb28fec46518" + "guid": "bae50fb0-744d-4a13-bc8a-64737a69776f" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) To implement hardening: - Disable basic authentication that uses a username and password in favor of Microsoft Entra ID-based authentication. - Turn off remote debugging so that inbound ports aren't opened. - Enable CORS policies to tighten incoming requests. 
- Disable protocols, such as FTP.", - "description": "We don't recommend basic authentication as a secure deployment method. Microsoft Entra ID employs OAuth 2.0 token-based authentication, which offers numerous advantages and enhancements that address the limitations that are associated with basic authentication. Policies restrict access to application resources, only allow requests from specific domains, and secure cross-region requests.", + "text": "(Web Apps) To implement hardening: - Disable basic authentication that uses a username and password in favor of Microsoft Entra ID-based authentication. - Turn off remote debugging so that inbound ports aren't opened. - Enable CORS policies to tighten incoming requests. - Disable protocols, such as FTP.", + "description": "We don't recommend basic authentication as a secure deployment method. Microsoft Entra ID employs OAuth 2.0 token-based authentication, which provides numerous advantages and enhancements that address the limitations that are associated with basic authentication. Policies restrict access to application resources, only allow requests from specific domains, and secure cross-region requests.", "type": "recommendation", - "guid": "07adcdee-eecc-43c4-9cf8-40e06efa96cc" + "guid": "c538c734-d159-46b2-a49a-2e2f94e7196c" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service) Always use Key Vault references as app settings.", + "text": "(Web Apps) Always use Key Vault references as app settings.", "description": "Secrets are kept separate from your app's configuration. App settings are encrypted at rest. 
App Service also manages secret rotations.", "type": "recommendation", - "guid": "298db56c-a733-4a8c-a009-224a1417de53" + "guid": "39302a22-e583-488d-9dc2-da69b427e111" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service plan) Enable Microsoft Defender for Cloud for App Service.", + "text": "(App Service) Enable Microsoft Defender for Cloud for App Service.", "description": "Get real-time protection for resources that run in an App Service plan. Guard against threats and enhance your overall security posture.", "type": "recommendation", - "guid": "6cd4ee41-8ebd-4a7e-9da4-793705575ea0" + "guid": "9473e705-2b53-40a2-b307-6605a206a011" }, { "waf": "Security", "service": "App Service Web Apps", - "text": "(App Service plan) Enable diagnostic logging and add instrumentation to your app. The logs are sent to Azure Storage accounts, Azure Event Hubs, and Log Analytics. For more information about audit log types, see Supported log types.", + "text": "(App Service) Enable diagnostic logging and add instrumentation to your app. The logs are sent to Azure Storage accounts, Azure Event Hubs, and Log Analytics. For more information about audit log types, see Supported log types.", "description": "Logging captures access patterns. It records relevant events that provide valuable insights into how users interact with an application or platform. This information is crucial for accountability, compliance, and security purposes.", "type": "recommendation", - "guid": "93a4781a-8e68-4ad4-af67-4ae28d534e6e" + "guid": "843684a2-dd37-486b-b36f-0a991d9836b5" }, { "waf": "cost", "service": "App Service Web Apps", - "text": "Estimate the initial cost: As part of your cost modeling exercise, use the Azure pricing calculator to evaluate the approximate costs associated with various tiers based on the number of instances that you plan to run. 
Each App Service tier offers different compute options.", + "text": "Estimate the initial cost: As part of your cost modeling exercise, use the Azure pricing calculator to evaluate the approximate costs associated with various tiers based on the number of instances that you plan to run. Each App Service tier provides different compute options.", "description": "", "type": "checklist", - "guid": "bb84a1f1-7f43-4656-bc0d-70abc288979b" + "guid": "5ef5d641-5a93-4ad3-ad4b-805d48d3128d" }, { "waf": "cost", @@ -252,10 +308,10 @@ { "waf": "cost", "service": "App Service Web Apps", - "text": "Understand usage meters: Azure charges an hourly rate, prorated to the second, based on your App Service plan's pricing tier. Charges apply to each scaled-out instance in your plan, based on the time that you allocate the VM instance. Pay attention to underused compute resources that might increase your costs as a result of overallocation due to suboptimal SKU selection, or poorly configured scale-in configuration.", + "text": "Understand usage meters: Azure charges an hourly rate, prorated to the second, based on your App Service plan's pricing tier. Charges apply to each scaled-out instance in your plan, based on the time that you allocate the VM instance. Pay attention to underused compute resources that might increase your costs as a result of overallocation because of suboptimal SKU selection, or poorly configured scale-in configuration.", "description": "", "type": "checklist", - "guid": "08ccd533-7d3f-438e-8274-a30d3e10d81e" + "guid": "722c70d0-5d16-4bfb-8dc7-f0ec9811c289" }, { "waf": "cost", @@ -268,34 +324,34 @@ { "waf": "cost", "service": "App Service Web Apps", - "text": "Evaluate the effect of your scaling strategy on cost: You must properly design, test, and configure for scaling out and for scaling in when you implement autoscaling. 
Establish precise maximum and minimum limits on autoscaling.", + "text": "Evaluate the effects of your scaling strategy on cost: You must properly design, test, and configure for scaling out and for scaling in when you implement autoscaling. Establish precise maximum and minimum limits on autoscaling.", "description": "", "type": "checklist", - "guid": "19d66260-5bfe-466e-89b0-f2573223acf9" + "guid": "82e088c1-ca25-4d09-80cf-18d63dc27fdb" }, { "waf": "cost", "service": "App Service Web Apps", - "text": "Optimize environment costs: Consider the Basic or Free tier to run pre-production environments. These tiers are low performance and low cost. If you use the Basic or Free tier, use governance to enforce the tier, constrain the number of instances and CPUs, restrict scaling, and limit log retention.", + "text": "Optimize environment costs: Consider using the basic tier or free tier to run pre-production environments. These tiers are low performance and low cost. If you use the basic tier or free tier, use governance to enforce the tier, constrain the number of instances and CPUs, restrict scaling, and limit log retention.", "description": "", "type": "checklist", - "guid": "b4798bde-d65a-4c44-a75e-e7c8c05df60e" + "guid": "745c855f-5b5d-4aa0-98f7-58a3cdc06d9e" }, { "waf": "cost", "service": "App Service Web Apps", - "text": "Implement design patterns: This strategy reduces the volume of requests that your workload generates. Consider using patterns like the Backends for Frontends pattern and the Gateway Aggregation pattern, which can minimize the number of requests and reduce costs.", + "text": "Implement design patterns: This strategy reduces the volume of requests that your workload generates. 
Consider using patterns like the Backends for Frontends pattern and the Gateway Aggregation pattern to minimize the number of requests and reduce costs.", "description": "", "type": "checklist", - "guid": "0cfc5cbc-61b7-4b64-a2eb-52e15c243cad" + "guid": "2db54c1f-e76b-47f0-90e2-77e0e1c12e50" }, { "waf": "cost", "service": "App Service Web Apps", - "text": "Regularly check data-related costs: Extended data retention periods or expensive storage tiers can lead to high storage costs. More expenses can accumulate due to both bandwidth usage and prolonged retention of logging data.", + "text": "Regularly check data-related costs: Extended data retention periods or expensive storage tiers can result in high storage costs. More expenses can accumulate because of bandwidth usage and prolonged retention of logging data.", "description": "", "type": "checklist", - "guid": "300e19c7-7406-47ca-aeca-bca5032f2b8a" + "guid": "f14a6304-4ba9-41f4-b19d-293625a31870" }, { "waf": "cost", @@ -308,34 +364,34 @@ { "waf": "Cost", "service": "App Service Web Apps", - "text": "(App Service plan) Choose Free or Basic tiers for lower environments. We recommend these tiers for experimental use. Remove the tiers when you no longer need them.", - "description": "The Free and Basic tiers are budget-friendly compared to higher tiers. They provide a cost-effective solution for nonproduction environments that don't need the full features and performance of premium plans.", + "text": "(App Service) Choose free tiers or basic tiers for lower environments. We recommend these tiers for experimental use. Remove the tiers when you no longer need them.", + "description": "The free tiers and basic tiers are budget-friendly compared to higher tiers. 
They provide a cost-effective solution for nonproduction environments that don't need the full features and performance of premium plans.", "type": "recommendation", - "guid": "df7294a6-460e-4d50-b4a9-66141caada1d" + "guid": "8198b56e-5245-454e-82fc-7f2ba732661a" }, { "waf": "Cost", "service": "App Service Web Apps", - "text": "(App Service plan) Take advantage of discounts and explore preferred pricing for: - Lower environments with dev/test plans. - Azure reservations and Azure savings plans for dedicated compute that you provision in the Premium V3 tier and App Service Environment. Use reserved instances for stable workloads that have predictable usage patterns.", - "description": "Dev/test plans provide reduced rates for Azure services, which makes them cost-effective for nonproduction environments. Use reserved instances to prepay for compute resources and get significant discounts.", + "text": "(App Service) Take advantage of discounts and explore preferred pricing for: - Lower environments with dev/test plans. - Azure reservations and Azure savings plans for dedicated compute that you provision in the Premium v3 tier and App Service Environment. Use reserved instances for stable workloads that have predictable usage patterns.", + "description": "Dev/test plans provide reduced rates for Azure services, which make them cost-effective for nonproduction environments. Use reserved instances to prepay for compute resources and get significant discounts.", "type": "recommendation", - "guid": "a4bd3bc2-554f-483f-9707-dcf9f1b7cdd2" + "guid": "83694515-338b-4683-8da4-a2413bfee3c2" }, { "waf": "Cost", "service": "App Service Web Apps", - "text": "(App Service) Monitor costs that App Service resources incur. Run the cost analysis tool in the Azure portal. Create budgets and alerts to notify stakeholders.", + "text": "(Web Apps) Monitor costs that App Service resources incur. Run the cost analysis tool in the Azure portal. 
Create budgets and alerts to notify stakeholders.", "description": "You can identify cost spikes, inefficiencies, or unexpected expenses early on. This proactive approach helps you to provide budgetary controls to prevent overspending.", "type": "recommendation", - "guid": "12f02b69-d400-403d-9e8a-1a67ed102764" + "guid": "5b59326f-bc0e-4475-880d-ec40007ff189" }, { "waf": "Cost", "service": "App Service Web Apps", - "text": "(App Service plan) Scale in when demand decreases. To scale in, define scale rules to reduce the number of instances in Azure Monitor.", + "text": "(App Service) Scale in when demand decreases. To scale in, define scale rules to reduce the number of instances in Azure Monitor.", "description": "Prevent wastage and reduce unnecessary expenses.", "type": "recommendation", - "guid": "e48373a3-b551-4715-853c-dee4f114466e" + "guid": "9a7c2a72-f881-41e5-b467-f9662d18609c" }, { "waf": "operations", @@ -348,10 +404,18 @@ { "waf": "operations", "service": "App Service Web Apps", - "text": "Run automated tests: Before you promote a release of your web app, thoroughly test its performance, functionality, and integration with other components. Use Azure Load Testing, which integrates with Apache JMeter, a popular tool for performance testing. Explore automated tools for other types of testing, such as Phantom for functional testing.", + "text": "Run automated tests: Before you promote a release of your web app, thoroughly test its performance, functionality, and integration with other components. Use Azure Load Testing. It integrates with Apache JMeter, a popular tool for performance testing. 
Explore automated tools for other types of testing, such as Phantom for functional testing.", "description": "", "type": "checklist", - "guid": "751098a9-7103-4250-bd52-f48a5fbe8b73" + "guid": "921d3792-5521-47d2-8c33-babb869b69fa" + }, + { + "waf": "operations", + "service": "App Service Web Apps", + "text": "Automate deployments: Use continuous integration and continuous deployment pipelines with Azure DevOps or GitHub Actions to automate deployments and reduce manual effort. For more information, see Continuous deployment to App Service.", + "description": "", + "type": "checklist", + "guid": "5b565081-2c47-4322-bf4b-261d10abc310" }, { "waf": "operations", @@ -364,10 +428,10 @@ { "waf": "operations", "service": "App Service Web Apps", - "text": "Keep production environments safe: Create separate App Service plans to run production and pre-production environments. Don't make changes directly in the production environment to ensure stability and reliability. Separate instances allow flexibility in development and testing before you promote changes to production.", + "text": "Keep production environments safe: Create separate App Service plans to run production and pre-production environments. Don't make changes directly in the production environment to help ensure stability and reliability. Separate instances allow flexibility in development and testing before you promote changes to production.", "description": "", "type": "checklist", - "guid": "3175ebdb-2846-4f48-b450-278df163c188" + "guid": "0bb81ca8-62a2-40fb-9438-04253ccb8add" }, { "waf": "operations", @@ -380,34 +444,34 @@ { "waf": "Operations", "service": "App Service Web Apps", - "text": "(App Service) Monitor the health of your instances and activate instance health probes. Set up a specific path for handling health probe requests.", + "text": "(Web Apps) Monitor the health of your instances and activate instance health probes. 
Set up a specific path for handling health probe requests.", "description": "You can detect problems promptly and take necessary actions to maintain availability and performance.", "type": "recommendation", - "guid": "75f6aad9-2abc-4171-9715-ba708928d39a" + "guid": "60426301-16bf-4dd5-a419-ad2e9718a3fe" }, { "waf": "Operations", "service": "App Service Web Apps", - "text": "(App Service) Enable diagnostics logs for the application and the instance. Frequent logging can slow down the performance of the system, add to storage costs, and introduce risk if you have unsecure access to logs. Follow these best practices: - Log the right level of information. - Set retention policies. - Keep an audit trail of authorized access and unauthorized attempts. - Treat logs as data and apply data-protection controls.", + "text": "(Web Apps) Enable diagnostics logs for the application and the instance. Frequent logging can slow down the performance of the system, add to storage costs, and introduce risk if you have unsecure access to logs. Follow these best practices: - Log the right level of information. - Set retention policies. - Keep an audit trail of authorized access and unauthorized attempts. - Treat logs as data and apply data-protection controls.", "description": "Diagnostic logs provide valuable insights into your app's behavior. 
Monitor traffic patterns and identify anomalies.", "type": "recommendation", - "guid": "91d861ad-a4ae-462e-8775-56cbecb31951" + "guid": "803959bf-c8c1-41a4-9d4c-3e90d7d2bc06" }, { "waf": "Operations", "service": "App Service Web Apps", - "text": "(App Service) Take advantage of App Service managed certificates to offload certification management to Azure.", + "text": "(Web Apps) Take advantage of App Service-managed certificates to offload certification management to Azure.", "description": "App Service automatically handles processes like certificate procurement, certificate verification, certificate renewal, and importing certificates from Key Vault. Alternatively, upload your certificate to Key Vault and authorize the App Service resource provider to access it.", "type": "recommendation", - "guid": "70e24ecc-5113-4d7d-8a79-8ac24fe1f838" + "guid": "412a50c5-6222-4b89-a181-44fa60ed3095" }, { "waf": "Operations", "service": "App Service Web Apps", - "text": "(App Service plan) Validate app changes in the staging slot before you swap it with the production slot.", + "text": "(App Service) Validate app changes in the staging slot before you swap it with the production slot.", "description": "Avoid downtime and errors. Quickly revert to the last-known good state if you detect a problem after a swap.", "type": "recommendation", - "guid": "7a592a00-a940-4d2e-a919-56ad3237092e" + "guid": "eed80f2e-c4ea-4518-a715-f2e76b2f47ac" }, { "waf": "performance", @@ -428,10 +492,10 @@ { "waf": "performance", "service": "App Service Web Apps", - "text": "Select the right tier: Use dedicated compute for production workloads. Premium tiers offer larger SKUs with increased memory and CPU capacity, more instances, and more features, such as zone redundancy. For more information, see Premium V3 pricing tier.", + "text": "Choose the right tier: Use dedicated compute for production workloads. 
Premium v3 tiers provide larger SKUs with increased memory and CPU capacity, more instances, and more features, such as zone redundancy. For more information, see Premium v3 pricing tier.", "description": "", "type": "checklist", - "guid": "7e11019e-4329-458a-b84a-9233aa8de1a5" + "guid": "d470fdd5-8db4-41d1-8154-51ee76f130a2" }, { "waf": "performance", @@ -452,26 +516,26 @@ { "waf": "performance", "service": "App Service Web Apps", - "text": "Review the performance antipatterns: To make sure the web application performs and scales in accordance with your business requirements, avoid the typical antipatterns. Here are some antipatterns that App Service corrects.", + "text": "Review the performance antipatterns: To make sure that the web application performs and scales in accordance with your business requirements, avoid the typical antipatterns. The following table describes some antipatterns that App Service corrects.", "description": "", "type": "checklist", - "guid": "c8094d1c-6371-4984-98c1-ca5d72a7732d" + "guid": "7a330ca2-886d-4a2f-8ba6-713022bf4158" }, { "waf": "Performance", "service": "App Service Web Apps", - "text": "Enable the Always On setting when applications share a single App Service plan. App Service apps automatically unload when idle to save resources. The next request triggers a cold start, which can cause request timeouts.", + "text": "(App Service) Enable the Always On setting when applications share a single App Service plan. App Service apps automatically unload when idle to save resources. 
The next request triggers a cold start, which can cause request time-outs.", "description": "The application is never unloaded with Always On enabled.", "type": "recommendation", - "guid": "2c97ff90-e7cd-4888-a0c9-c21ea287071c" + "guid": "1e08dd2e-2b85-4feb-8e25-129b08d4575c" }, { "waf": "Performance", "service": "App Service Web Apps", - "text": "Consider using HTTP/2 for applications to improve protocol efficiency.", + "text": "(Web Apps) Consider using HTTP/2 for applications to improve protocol efficiency.", "description": "Choose HTTP/2 over HTTP/1.1 because HTTP/2 fully multiplexes connections, reuses connections to reduce overhead, and compresses headers to minimize data transfer.", "type": "recommendation", - "guid": "d735fb9e-9e87-415c-93d0-4918b5cf104e" + "guid": "21d87d8b-b32e-4701-b588-cf5e1cc5dfe5" }, { "waf": "reliability", @@ -789,7 +853,7 @@ "waf": "Performance", "service": "Azure Application Gateway", "text": "Set the minimum instance count to an optimal level based on you estimated instance count, actual Application Gateway autoscaling trends, and your application patterns. Check the current compute units for the past month. This metric represents the gateway's CPU usage. To define the minimum instance count, divide the peak usage by 10. For example, if your average current compute units in the past month is 50, set the minimum instance count to five.", - "description": "For Application Gateway v2, autoscaling takes approximately six to seven minutes before the extra set of instances are ready to serve traffic. During that time, if Application Gateway has short spikes in traffic, expect transient latency or loss of traffic.", + "description": "For Application Gateway v2, autoscaling takes approximately three to five minutes before the extra set of instances are ready to serve traffic. 
During that time, if Application Gateway has short spikes in traffic, expect transient latency or loss of traffic.", "type": "recommendation", "guid": "b556535f-178c-4d6f-a2eb-be758dfd24da" }, @@ -889,6 +953,14 @@ "type": "recommendation", "guid": "6ba14c6a-6f37-46a3-a2a8-e38373925d6f" }, + { + "waf": "Reliability", + "service": "Azure Blob Storage", + "text": "Configure vaulted backup for Azure Blob as a part of your backup strategy.", + "description": "Vaulted backup enables you to protect the block blob data from ransomware, other malicious attacks, or source data loss. The data is copied and stored in the Backup vault (an offsite copy of data) that can be retained for up to 10 years. If any data loss happens on the source account, you can trigger a restore to an alternate account and get access to your data. Learn more about the supportability for vaulted backup using Azure Backup.", + "type": "recommendation", + "guid": "4a608800-9b96-4468-98eb-d1cb77d29341" + }, { "waf": "security", "service": "Azure Blob Storage", @@ -1324,410 +1396,586 @@ { "waf": "reliability", "service": "Azure Expressroute", - "text": "Select between ExpressRoute circuit or ExpressRoute Direct for business requirements.", + "text": "Build redundancy, strengthen resiliency: Eliminate single points of failure as much as practical. Plan for redundancy in the network design by configuring multiple ExpressRoute circuits, diverse paths, and multiple peering locations closest to your on-premises locations.", + "description": "", + "type": "checklist", + "guid": "eaa938e3-df12-4f13-9be2-0840710f4140" + }, + { + "waf": "reliability", + "service": "Azure Expressroute", + "text": "Anticipate potential failures: Plan mitigation strategies for potential failures. 
The following table shows examples of failure mode analysis.", + "description": "", + "type": "checklist", + "guid": "a67a6739-c1af-48d6-a3d1-001fb5105139" + }, + { + "waf": "reliability", + "service": "Azure Expressroute", + "text": "Plan for site resiliency: Planning for site resiliency is crucial to ensure high availability. ExpressRoute offers three architectures of site resiliency: Standard, High, and Maximum. Standard resiliency provides basic protection against link failures, but does not provide protection against site failures. High resiliency offers enhanced protection with additional failover mechanisms, and Maximum resiliency ensures the highest level of protection with multiple redundant systems and failover mechanisms.", + "description": "", + "type": "checklist", + "guid": "97a5c131-c9fc-4df7-8e40-712cff4c0347" + }, + { + "waf": "reliability", + "service": "Azure Expressroute", + "text": "Plan for regions and availability zones: Plan for multiple regions and availability zones closest to your on-premises locations to provide resiliency and high availability.", + "description": "", + "type": "checklist", + "guid": "464ff844-fbfb-4b54-a187-a0e16fe057a1" + }, + { + "waf": "reliability", + "service": "Azure Expressroute", + "text": "Plan for ExpressRoute circuit or ExpressRoute Direct: During the initial planning phase, you want to decide whether you want to configure an ExpressRoute circuit or an ExpressRoute Direct connection.", + "description": "", + "type": "checklist", + "guid": "120c787b-2ff9-46be-8659-86d8258ebbdf" + }, + { + "waf": "reliability", + "service": "Azure Expressroute", + "text": "Choose the right circuit SKU: ExpressRoute circuit SKUs provide redundancy through the use of geographic expansion. 
ExpressRoute has three SKUs: Local, Standard, and Premium.", + "description": "", + "type": "checklist", + "guid": "90dba368-7b38-40fd-9055-fddc57fa5905" + }, + { + "waf": "reliability", + "service": "Azure Expressroute", + "text": "Plan for Active-Active connectivity: ExpressRoute dedicated circuits provide availability when active-active connectivity is configured between on-premises and Azure. This configuration provides higher availability of your ExpressRoute connection.", + "description": "", + "type": "checklist", + "guid": "ac47e28b-a726-486c-84a0-3737328be716" + }, + { + "waf": "reliability", + "service": "Azure Expressroute", + "text": "Plan for geo-redundant circuits: Configure ExpressRoute circuits in more than one peering location to ensure that there are multiple, geographically diverse paths between on-premises networks and Azure. This reduces the risk of a single point of failure causing a network outage, thereby increasing the reliability and availability of the connection.", + "description": "", + "type": "checklist", + "guid": "05e0c588-4705-4742-a85e-90809becc049" + }, + { + "waf": "reliability", + "service": "Azure Expressroute", + "text": "Configure ExpressRoute Global Reach: As an ExpressRoute circuit Premium SKU feature, ExpressRoute Global Reach allows you to link your on-premises networks across different geographical locations directly through the Azure backbone network. By connecting your on-premises networks to multiple Azure regions, Global Reach provides an additional layer of redundancy. 
If one Azure region becomes unavailable, you can quickly reroute traffic to another region without relying on the public internet, maintaining secure and reliable connectivity.", "description": "", "type": "checklist", - "guid": "473b3683-6d53-4521-89c4-d8aa1d1df633" + "guid": "674dde16-ad5a-442d-9154-b06b5d86452a" }, { "waf": "reliability", "service": "Azure Expressroute", - "text": "Configure ExpressRoute circuits with Maximum or High Resiliency for production workloads.", + "text": "Configure site-to-site VPN as a backup to ExpressRoute private peering: This configuration provides an additional layer of redundancy and ensures that your network remains operational even if the ExpressRoute connection experiences an outage.", "description": "", "type": "checklist", - "guid": "e58c1767-6db4-4b40-a26e-1ab8967517f4" + "guid": "eb749fa2-f7db-4c21-b578-b38bb05dc4ff" }, { "waf": "reliability", "service": "Azure Expressroute", - "text": "Configure Active-Active ExpressRoute connections between on-premises and Azure.", + "text": "Plan for Virtual Network Gateways: When selecting and configuring your ExpressRoute Virtual Network Gateway for resiliency, consider the following best practices:", "description": "", "type": "checklist", - "guid": "b356e60e-cb41-4ee6-a8d5-290b429619f7" + "guid": "de1db633-02f1-40a0-b67b-f2f15647b05c" }, { "waf": "reliability", "service": "Azure Expressroute", - "text": "Set up availability zone aware ExpressRoute Virtual Network Gateways.", + "text": "Plan for service providers: Choose different service providers for each circuit to ensure diverse paths. 
This diversity in service providers minimizes the risk of network downtime due to a single provider's outage.", "description": "", "type": "checklist", - "guid": "877b9a2d-8171-441b-ba7f-b8c6191f12bc" + "guid": "8da8c5b5-e2d6-4379-9773-8464a55c185d" }, { "waf": "reliability", "service": "Azure Expressroute", - "text": "Configure ExpressRoute Virtual Network Gateways in different regions.", + "text": "Conduct reliability testing: Test the network design for resiliency to ensure that the network can withstand failures. Testing can be achieved by using Azure Connectivity Toolkit to test performance across your ExpressRoute circuit to understand bandwidth capacity and latency of your network connection. Confirm failover mechanisms are working as expected.", "description": "", "type": "checklist", - "guid": "64ad6a67-7f17-4d55-a365-0ec8716fb135" + "guid": "2db2b6fa-5f0b-47b2-a637-eafcceebba53" }, { "waf": "reliability", "service": "Azure Expressroute", - "text": "Configure site-to-site VPN as a backup to ExpressRoute private peering.", + "text": "Configure monitoring for ExpressRoute circuits and ExpressRoute Virtual Network Gateway health: Configure monitoring and alerts for ExpressRoute circuit and ExpressRoute Virtual Network Gateway health based on various metrics available.", "description": "", "type": "checklist", - "guid": "20bbf3a5-e3d8-42eb-9b88-9c4a811a483a" + "guid": "f8b8370b-fe3a-444c-8e3d-508a8f373e35" }, { "waf": "reliability", "service": "Azure Expressroute", - "text": "Configure service health to receive ExpressRoute circuit maintenance notifications.", + "text": "Use health indicators to identify disruptions: Configure monitoring and alerts for ExpressRoute circuit and ExpressRoute Virtual Network Gateway health based on various metrics available.", "description": "", "type": "checklist", - "guid": "da08260d-363c-4fcb-a555-ed4448d0be3a" + "guid": "61936148-0465-4542-9f13-69305a4635e4" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", 
+ "text": "Anticipate and mitigate potential failures when you design and architect Azure ExpressRoute.", + "description": "Anticipating failures leads to the design of a more robust and resilient network architecture that can withstand various failure scenarios.", + "type": "recommendation", + "guid": "367f99a6-30e8-40e4-b76e-f8f55538c59b" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Plan for site resiliency. For Maximum or High resiliency, plan to have multiple paths between the on-premises edge and the peering locations (provider/Microsoft edge locations). For Maximum Resiliency, configure multiple circuits to different peering locations. For High Resiliency, configure a circuit between multiple peering locations within the same metropolitan area (also referred to as ExpressRoute Metro) from the on-premises network.", + "description": "By having multiple paths between the on-premises edge and the peering locations, the network can continue to operate even if one path fails. This redundancy is crucial for maintaining continuous connectivity and minimizing downtime.", + "type": "recommendation", + "guid": "009e9cf8-7610-4ab0-a4fc-3d0b8e14d1a3" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Plan for multiple regions and availability zones.", + "description": "Availability zones are physically separate locations within a region, providing fault isolation. This means that failures in one zone don't affect the others, enhancing overall system reliability.", + "type": "recommendation", + "guid": "1fba669c-f42f-4a0c-a9d4-c278bb077ee4" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Plan for ExpressRoute circuit or ExpressRoute Direct. During the initial planning phase, you want to decide whether you want to configure an ExpressRoute circuit or an ExpressRoute Direct connection. 
You also need to identify the bandwidth requirement and the SKU type requirement for your business needs.", + "description": "An ExpressRoute circuit allows a private dedicated connection into Azure with the help of a connectivity provider. ExpressRoute Direct allows you to extend on-premises network directly into the Microsoft network at a peering location.", + "type": "recommendation", + "guid": "edb0633f-0ce0-4f1b-908f-1ce1fb24fc2b" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Plan for ExpressRoute circuit or ExpressRoute Direct", - "description": "During the initial planning phase, you want to decide whether you want to configure an ExpressRoute circuit or an ExpressRoute Direct connection. An ExpressRoute circuit allows a private dedicated connection into Azure with the help of a connectivity provider. ExpressRoute Direct allows you to extend the on-premises network directly into the Microsoft network at a peering location. You also need to identify the bandwidth requirement and the SKU type requirement for your business needs.", + "text": "Choose the right circuit SKU for redundancy by using geographic expansion. The Local, Standard, and Premium SKUs offer different levels of connectivity, access, and performance capabilities. Premium SKU provides the highest level of redundancy with global connectivity to any Azure region worldwide.", + "description": "Choosing the right circuit SKU ensures that you have the appropriate level of redundancy and connectivity for your workloads.", "type": "recommendation", - "guid": "09e0dd1a-b1f7-46c3-8df1-48e841f53dca" + "guid": "83d0356a-e28a-4e3d-bbb7-8c98c20fa003" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Plan for geo-redundant circuits", - "description": "To plan for disaster recovery, set up ExpressRoute circuits in more than one peering locations. 
You can create circuits in peering locations in the same metro or different metro and choose to work with different service providers for diverse paths through each circuit. For more information, see Designing for disaster recovery and Designing for high availability.", + "text": "Plan for Active-Active connectivity. To improve high availability, redundancy, and resiliency, we recommend operating both connections of an ExpressRoute circuit in active-active mode. Additionally, configure Bi-Directional Forwarding Detection (BFD) over both private and Microsoft Peering for faster failover during a link failure.", + "description": "Active-active mode provides higher availability of your ExpressRoute connections. BFD provides rapid detection of link failures, enabling quicker failover to backup paths. This minimizes downtime and ensures continuous connectivity.", "type": "recommendation", - "guid": "257031a8-f034-436c-9f54-e82aab53c559" + "guid": "2262ac47-3edb-4d69-bd13-7222769f04fb" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Plan for Active-Active connectivity", - "description": "This mode provides higher availability of your Expressroute connections. It's also recommended to configure BFD for faster failover if there's a link failure on a connection.", + "text": "Plan for geo-redundant circuits.", + "description": "There are scenarios where an ExpressRoute peering location or an entire regional service might experience degradation. Geo-redundancy enhances disaster recovery and high availability by ensuring that there are multiple, geographically diverse paths between on-premises networks and Azure. 
This reduces the risk of a single point of failure causing a network outage, thereby increasing the reliability and availability of the connection.", "type": "recommendation", - "guid": "068037d8-673f-4e86-bc9d-bf83fbe61d12" + "guid": "b03be81e-a512-4b83-b9cd-9fc6535db6fb" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Planning for Virtual Network Gateways", - "description": "Create availability zone aware Virtual Network Gateway for higher resiliency and plan for Virtual Network Gateways in different regions for resiliency, disaster recovery, and high availability.", + "text": "With ExpressRoute Global Reach you can link ExpressRoute circuits together to make a private network between your on-premises networks. Configure ExpressRoute Global Reach on your ExpressRoute circuit Premium SKU.", + "description": "ExpressRoute Global Reach provides an additional layer of redundancy by linking your on-premises networks across different geographical locations directly through the Azure backbone network. This ensures that your network remains connected and operational even if one Azure region becomes unavailable.", "type": "recommendation", - "guid": "21f65e89-ffe2-489f-89f2-16cbc2e257d9" + "guid": "b752e634-ea09-4964-a647-5452f21d8166" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Monitor circuits and gateway health", - "description": "Set up monitoring and alerts for ExpressRoute circuits and Virtual Network Gateway health based on various metrics available.", + "text": "Choose different ExpressRoute service providers for each circuit.", + "description": "Diversity in service providers minimizes the risk of network downtime due to a single provider's outage. By choosing different service providers for each circuit, you can ensure that your network remains operational even if one provider experiences an outage. 
This redundancy is essential for maintaining continuous connectivity and minimizing downtime.", "type": "recommendation", - "guid": "0f875bf3-de86-41b5-80d2-477de2f769a2" + "guid": "68a204a8-0af6-49fe-ad86-96e5946addbe" }, { "waf": "Reliability", "service": "Azure Expressroute", - "text": "Enable service health", - "description": "ExpressRoute uses service health to notify about planned and unplanned maintenance. Configuring service health will notify you about changes made to your ExpressRoute circuits.", + "text": "Configure Site-to-Site VPN over Microsoft peering as a backup to ExpressRoute private peering. Site-to-site VPN provides an additional layer of redundancy and ensures that your network remains operational even if the ExpressRoute connection experiences an outage.", + "description": "By configuring a site-to-site VPN as a backup to ExpressRoute private peering, you can maintain continuous connectivity and minimize downtime.", "type": "recommendation", - "guid": "c84ca8b2-74f8-4d25-8fc3-5b30c9969b5f" + "guid": "f72d4670-8956-41b7-aeb6-adbff0857880" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Planning for zone-redundant Virtual Network Gateways. Select the right ExpressRoute Virtual Network Gateway SKU to reflect the correct performance and throughput for your business. Consider deploying a scalable virtual network gateway that allows you to achieve 40-Gbps connectivity and will auto-scale based on your required throughput. Deploy ExpressRoute virtual network gateways that are zone-redundant for maximum resiliency and redundancy across Availability Zones.", + "description": "Choosing the appropriate SKU ensures that the gateway can handle the required performance and throughput for your business needs. A scalable virtual network gateway autoscales based on required throughput, allowing the network to adapt to changing demands. This flexibility helps maintain performance during peak usage times and prevents overloading. 
Additionally, deploying zone-redundant virtual network gateways ensures that the network remains operational even if one availability zone experiences an outage, enhancing overall reliability and resiliency.", + "type": "recommendation", + "guid": "8e7427c7-6385-48c7-b144-345f90cdf217" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Conduct reliability testing with the Azure Connectivity Toolkit to ensure that the network design is resilient and can withstand failures.", + "description": "Reliability testing helps identify potential issues and weaknesses in the network design, allowing you to address them proactively. By conducting reliability testing, you can ensure that the network is robust and resilient, minimizing downtime and ensuring continuous connectivity.", + "type": "recommendation", + "guid": "bf9a6d1a-bc20-4f68-b073-7da6c756b389" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Configure monitoring and alerts for ExpressRoute circuits, peering, ports, and Virtual Network Gateway resource health based on various available metrics. This helps in proactively managing and maintaining the health of your network. Use Network Insights for ExpressRoute to visualize topological maps and health dashboards, providing a clear view of your configurations and their status.", + "description": "By setting up monitoring and alerts based on various metrics, you can proactively detect and address issues such as increased latency, traffic drops, or circuit downtimes before they impact your services.", + "type": "recommendation", + "guid": "472a2de0-6895-4d96-88aa-b4192e8101a5" + }, + { + "waf": "Reliability", + "service": "Azure Expressroute", + "text": "Configure service health to notify you about planned and unplanned maintenance. 
Configuring service health notifies you about changes made to your ExpressRoute circuits.", + "description": "With Service Health, you can view planned and past maintenance in the Azure portal along with configuring alerts and notifications that best suit your needs.", + "type": "recommendation", + "guid": "eabe2ca9-baee-4fa8-9fc8-f669088805ea" }, { "waf": "security", "service": "Azure Expressroute", - "text": "Configure Activity log to send logs to archive.", + "text": "Leverage Azure Security Baseline for ExpressRoute: The Microsoft cloud security benchmark provides recommendations on how you can secure your cloud solutions on Azure.", "description": "", "type": "checklist", - "guid": "ca1c5676-1b0a-426e-baaf-da74ab806cb4" + "guid": "a82da696-7b8c-4ce0-b54c-d9335a4c0ec3" }, { "waf": "security", "service": "Azure Expressroute", - "text": "Maintain an inventory of administrative accounts with access to ExpressRoute resources.", + "text": "Implement Azure Role-Based Access Control (RBAC): Use Azure RBAC to configure roles to limit user accounts that can add, update, or delete peering configurations on an ExpressRoute circuit or change ExpressRoute resources.", "description": "", "type": "checklist", - "guid": "ef124dcd-17e6-4b4e-9bdd-511ef1959a05" + "guid": "c0ba0d1f-8e31-4f22-9555-edbbb29178c2" }, { "waf": "security", "service": "Azure Expressroute", - "text": "Configure MD5 hash on ExpressRoute circuit.", + "text": "Configure ExpressRoute encryption: Encrypt data in transit over ExpressRoute circuits to ensure that data transmitted between on-premises networks and Azure virtual networks is secure and protected from unauthorized access. 
ExpressRoute supports the following encryption options:", "description": "", "type": "checklist", - "guid": "23fbb2f6-a269-4fb5-a3a0-04aae0516c91" + "guid": "c60df2ae-4d84-4ebe-83f9-98fd599e11fb" }, { "waf": "security", "service": "Azure Expressroute", - "text": "Configure MACSec for ExpressRoute Direct resources.", + "text": "Configure MD5 hash on ExpressRoute circuit: During configuration of private peering or Microsoft peering, apply an MD5 hash to secure messages between the on-premises router and the MSEE routers.", "description": "", "type": "checklist", - "guid": "8e02b876-d810-498b-b9a5-e50730fb10d6" + "guid": "5b4f3423-1ad5-40d8-9183-02adbda43f0c" }, { "waf": "security", "service": "Azure Expressroute", - "text": "Encrypt traffic over private peering and Microsoft peering for virtual network traffic.", + "text": "Configure Activity log to send logs to archive: Activity logs are essential for auditing, compliance, incident response, operational visibility, and policy enforcement for ExpressRoute. Configure Activity log to send logs to an archive for long-term retention and analysis.", "description": "", "type": "checklist", - "guid": "946a3e3d-bbaf-4b4a-ab80-2ef0a4631f30" + "guid": "c8d9867b-4f88-470f-8f8e-8a09d80a8f0d" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Configure Activity log to send logs to archive", - "description": "Activity logs provide insights into operations that were performed at the subscription level for ExpressRoute resources. With Activity logs, you can determine who and when an operation was performed at the control plane. Data retention is only 90 days and required to be stored in Log Analytics, Event Hubs or a storage account for archive.", + "text": "Leverage Azure Security Baseline for ExpressRoute. 
This security baseline applies guidance from the Microsoft Cloud Security Benchmark version 1.0 to ExpressRoute.", + "description": "The content is organized by the security controls defined in the benchmark and includes related guidance specific to ExpressRoute.", "type": "recommendation", - "guid": "9631d7ef-657c-4b07-9c75-96b2dcc5c5d2" + "guid": "ef173deb-b58f-47c2-b4a1-f11d49404249" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Maintain inventory of administrative accounts", - "description": "Use Azure RBAC to configure roles to limit user accounts that can add, update, or delete peering configuration on an ExpressRoute circuit.", + "text": "Implement Azure Role-Based Access Control (RBAC) to control who can manage ExpressRoute resources such as ExpressRoute circuits and gateways.", + "description": "By providing granular access management to resources, you can maintain an inventory of administrative accounts with access to ExpressRoute resources and ensure that only authorized users can perform specific actions.", "type": "recommendation", - "guid": "42b91c75-909f-4366-b014-48ab48639faf" + "guid": "924b3ac3-1327-41b6-9055-ffc14eb2cb43" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Configure MD5 hash on ExpressRoute circuit", - "description": "During configuration of private peering or Microsoft peering, apply an MD5 hash to secure messages between the on-premises route and the MSEE routers.", + "text": "Configure MACsec for ExpressRoute Direct ports.", + "description": "MACsec (Media Access Control security) enhances security by encrypting data, ensuring data integrity, protecting vulnerable protocols. 
It secures protocols that are typically not protected on Ethernet links, such as ARP, DHCP, and LACP, thereby preventing potential security threats targeting these protocols.", "type": "recommendation", - "guid": "78f7d298-53bf-49ae-8ed7-994d46ccf2dd" + "guid": "d4e1f1bd-f019-4b5f-9a33-5a6a455e9648" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Configure MACSec for ExpressRoute Direct resources", - "description": "Media Access Control security is a point-to-point security at the data link layer. ExpressRoute Direct supports configuring MACSec to prevent security threats to protocols such as ARP, DHCP, LACP not normally secured on the Ethernet link. For more information on how to configure MACSec, see MACSec for ExpressRoute Direct ports.", + "text": "Encrypt traffic using IPsec (Internet Protocol Security) for ExpressRoute private peering or configure a tunnel using private peering.", + "description": "IPsec encrypts data at the network layer (Layer 3) and enhances security by providing encryption, authentication, integrity protection, and compliance. This ensures that data transmitted over ExpressRoute circuits is secure and protected from unauthorized access and tampering.", "type": "recommendation", - "guid": "d495a140-702d-4e08-bb86-7ceac8141df2" + "guid": "c2df5102-5fa1-45e5-9d29-8e3e1f69f2ae" }, { "waf": "Security", "service": "Azure Expressroute", - "text": "Encrypt traffic using IPsec", - "description": "Configure a Site-to-site VPN tunnel over your ExpressRoute circuit to encrypt data transferring between your on-premises network and Azure virtual network. 
You can configure a tunnel using private peering or using Microsoft peering.", + "text": "Configure MD5 hash on ExpressRoute circuit during configuration of private peering or Microsoft peering to secure messages between the on-premises router and the MSEE routers.", + "description": "By generating an MD5 hash of the data before transmission and comparing it with the hash generated after reception, you can ensure that the data hasn't been tampered with during transit.", "type": "recommendation", - "guid": "7729c230-dbdf-4aec-9295-fcb0a0c365f2" + "guid": "aa98e684-2f18-48ff-9436-79797e7ef088" + }, + { + "waf": "Security", + "service": "Azure Expressroute", + "text": "Configure activity logs and send logs to an archive. Data retention is only 90 days and required to be stored in Log Analytics, Event Hubs or a storage account for archive. For more information about Activity logs in ExpressRoute, see Monitor Azure ExpressRoute.", + "description": "Activity logs provide insights into operations that were performed at the subscription level for ExpressRoute resources. With Activity logs, you can determine who and when an operation was performed at the control plane.", + "type": "recommendation", + "guid": "4112e9d9-4ad2-4d86-bc8b-a03a142dae94" }, { "waf": "cost", "service": "Azure Expressroute", - "text": "Familiarize yourself with ExpressRoute pricing.", + "text": "Familiarize yourself with ExpressRoute pricing: As part of your cost model exercise, estimate the cost of ExpressRoute.
Ensure that the options are adequately sized to meet the capacity demand and deliver expected performance without wasting resources.", "description": "", "type": "checklist", - "guid": "43d6df90-c15b-494c-8d35-c4fc9180fbdb" + "guid": "440aab78-fe9c-429e-bf1a-d2cc426e8ed1" }, { "waf": "cost", "service": "Azure Expressroute", - "text": "Determine the ExpressRoute circuit SKU and bandwidth required.", + "text": "Determine circuit SKU and bandwidth required: Base your selection of ExpressRoute circuit and virtual network gateway SKU and bandwidth on the capacity demand and performance requirements of your workload.", "description": "", "type": "checklist", - "guid": "92065590-2f1a-4a81-a6a6-2b102f66f9e3" + "guid": "3673ac26-dbbb-47be-8a94-e50af82db1ec" }, { "waf": "cost", "service": "Azure Expressroute", - "text": "Determine the ExpressRoute virtual network gateway size required.", + "text": "Determine the ExpressRoute virtual network gateway size: Choose the right size for your ExpressRoute virtual network gateway based on the capacity demand and performance requirements of your workload.", "description": "", "type": "checklist", - "guid": "f6e0770f-fa13-450e-8c81-baf51ba1b550" + "guid": "88664a21-12be-440a-a242-31fa4921a25c" }, { "waf": "cost", "service": "Azure Expressroute", - "text": "Monitor cost and create budget alerts.", + "text": "Monitor cost and create budget alerts: Monitor the cost of your ExpressRoute circuit and create alerts for spending anomalies and overspending risks.", "description": "", "type": "checklist", - "guid": "07d0ba21-7eef-47d1-8ba0-26fefa26c733" + "guid": "f5af3e24-84d4-4d38-b167-15491d333de6" }, { "waf": "cost", "service": "Azure Expressroute", - "text": "Deprovision ExpressRoute circuits no longer in use.", + "text": "Deprovision and delete unused ExpressRoute circuits: Azure Advisor can detect ExpressRoute circuits that have been deployed for a significant time but have a provider status of Not Provisioned.", "description": "", 
"type": "checklist", - "guid": "d7be65f4-6500-49ea-92e3-3121fca4a076" + "guid": "bf67027e-4fb3-4c20-a8bf-54389b234ce5" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Familiarize yourself with ExpressRoute pricing", - "description": "For information about ExpressRoute pricing, see Understand pricing for Azure ExpressRoute. You can also use the Pricing calculator.Ensure that the options are adequately sized to meet the capacity demand and deliver expected performance without wasting resources.", + "text": "Familiarize yourself with ExpressRoute pricing. Use the Azure Pricing Calculator to estimate the cost. ExpressRoute Direct has a monthly port fee that includes the circuit fee for Local and Standard SKU ExpressRoute circuits. For Premium SKU circuits, there's an additional circuit fee. Outbound data transfer is charged per GB used, depending on the zone number of the peering location. The outbound data charge applies only to Standard and Premium SKUs. For more information, see plan and manage costs for Azure ExpressRoute.", + "description": "Understanding ExpressRoute pricing enables better cost management, informed decision-making, avoidance of unexpected charges and maximization of value.", "type": "recommendation", - "guid": "f230ac81-7590-4300-9b9f-95d784e60ab2" + "guid": "c0c95335-56a8-4348-aa46-1d50bde8ce73" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Determine SKU and bandwidth required", - "description": "The way you're charged for your ExpressRoute usage varies between the three different SKU types. With Local SKU, you're automatically charged with an Unlimited data plan. With Standard and Premium SKU, you can select between a Metered or an Unlimited data plan. All ingress data are free of charge except when using the Global Reach add-on. It's important to understand which SKU types and data plan works best for your workload to best optimize cost and budget. 
For more information resizing ExpressRoute circuit, see upgrading ExpressRoute circuit bandwidth.", + "text": "Determine circuit SKU and bandwidth required. The way you're charged for your ExpressRoute usage varies between the three different SKU types. With the Local SKU, you're automatically charged with an Unlimited data plan. With the Standard and Premium SKUs, you can choose between a Metered or an Unlimited data plan. All ingress data is free of charge, except when using the Global Reach add-on, which incurs additional costs for data transfer between different geographical locations. It's important to review and resize your ExpressRoute circuit.", + "description": "It's important to understand which SKU types and data plan works best for your workload to best optimize cost and budget.", "type": "recommendation", - "guid": "3d8a5d49-af34-431f-b47f-ee8cf05479b5" + "guid": "63d35ff5-ac94-4fb4-8a29-0b6ab55a06be" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Determine the ExpressRoute virtual network gateway size", - "description": "ExpressRoute virtual network gateways are used to pass traffic into a virtual network over private peering. Review the performance and scale needs of your preferred Virtual Network Gateway SKU. Select the appropriate gateway SKU on your on-premises to Azure workload.", + "text": "Determine the size of the ExpressRoute Virtual Network Gateway. ExpressRoute virtual network gateways are used to pass traffic into a virtual network over private peering. Select the appropriate gateway SKU on your on-premises to Azure workload. Understand ExpressRoute Gateway pricing based on region and type. ExpressRoute Gateways are charged at an hourly rate plus the cost of an ExpressRoute circuit. Configure scalable ExpressRoute gateways to set minimum and maximum scale units for the gateway, which auto-scales based on active bandwidth or flow count. 
See ExpressRoute pricing and select ExpressRoute Gateways to see rates for different gateway SKUs.", + "description": "This benefits you by enabling right-sizing of resources, providing flexibility to scale, optimizing performance, and supporting proactive cost management. This approach ensures that you're using resources efficiently and cost-effectively.", "type": "recommendation", - "guid": "82224292-a5a6-4b85-9b2f-b617117c4285" + "guid": "4891572a-356b-4752-88ce-c344a980cab2" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Monitor cost and create budget alerts", - "description": "Monitor the cost of your ExpressRoute circuit and create alerts for spending anomalies and overspending risks. For more information, see Monitoring ExpressRoute costs.", + "text": "Monitor costs and create budget alerts. Monitor the cost of your ExpressRoute circuit and create alerts for spending anomalies and overspending risks.", + "description": "Monitoring and alerts provide you with tools to control spending, enhance financial planning, ensure accountability, and optimize resource usage.", "type": "recommendation", - "guid": "e2e81918-e05e-49e1-a37c-cb65840c8699" + "guid": "095c589a-08db-4c98-9528-268babb2aecc" }, { "waf": "Cost", "service": "Azure Expressroute", - "text": "Deprovision and delete ExpressRoute circuits no longer in use.", - "description": "ExpressRoute circuits are charged from the moment they're created. To reduce unnecessary cost, deprovision the circuit with the service provider and delete the ExpressRoute circuit from your subscription. For steps on how to remove an ExpressRoute circuit, see Deprovisioning an ExpressRoute circuit.", + "text": "Deprovision and delete unused ExpressRoute circuits. Azure Advisor can detect ExpressRoute circuits that have been deployed for a significant time but have a provider status of Not Provisioned.", + "description": "ExpressRoute circuits are charged from the moment they're created. 
To reduce unnecessary cost, deprovision the circuit with the service provider and delete the ExpressRoute circuit from your subscription.", "type": "recommendation", - "guid": "131104a1-a17f-4b6b-9384-0b636a5d5265" + "guid": "1b38da30-523c-4dc6-8202-02dbb66f94e0" + }, + { + "waf": "operations", + "service": "Azure Expressroute", + "text": "Choose the closest peering locations: Choose the closest peering locations to your on-premises network to reduce latency and costs.", + "description": "", + "type": "checklist", + "guid": "c04e4549-45ba-41e5-b072-af60fbfde9b5" }, { "waf": "operations", "service": "Azure Expressroute", - "text": "Configure connection monitoring between your on-premises and Azure network.", + "text": "Configure connection monitoring between your on-premises and Azure network: Use Connection Monitor to monitor connectivity between your on-premises resources and Azure over the ExpressRoute private peering and Microsoft peering connection.", "description": "", "type": "checklist", - "guid": "4c7d0c83-02a0-4535-a378-c2ab4c13469c" + "guid": "99151540-adc8-4532-ad9f-f18b5ae0cbe3" }, { "waf": "operations", "service": "Azure Expressroute", - "text": "Configure Service Health for receiving notification.", + "text": "Configure dynamic routing for your Microsoft peering enabled ExpressRoute circuit: Dynamic routing for ExpressRoute leverages BGP to provide automatic route updates, optimal path selection, scalability, and interoperability for your network.", "description": "", "type": "checklist", - "guid": "69c4fb71-4d2c-4534-a4db-5e3146a31e1d" + "guid": "3eb03dcb-410e-4ab1-9a43-051360133ae8" }, { "waf": "operations", "service": "Azure Expressroute", - "text": "Review metrics and dashboards available through ExpressRoute Insights using Network Insights.", + "text": "Configure Service Health for receiving notification: Configure Service Health notifications to alert you when planned and upcoming maintenance is happening to all ExpressRoute circuits in your 
subscription. For more information on how to integrate with the overall health model for your workload, see Health modeling for workloads.", "description": "", "type": "checklist", - "guid": "33b4fca5-1f90-4947-8091-6c23aba0651a" + "guid": "48305ec4-496e-4635-b6d0-617af80d8300" }, { "waf": "operations", "service": "Azure Expressroute", - "text": "Review ExpressRoute resource metrics.", + "text": "Configure Traffic Collector for ExpressRoute: ExpressRoute Traffic Collector enables the sampling of network flows over your ExpressRoute circuits.", "description": "", "type": "checklist", - "guid": "638c050d-7555-4575-bb8d-a4f2b613fa87" + "guid": "4407b332-26de-4333-b8bb-81cbbd3efcda" + }, + { + "waf": "operations", + "service": "Azure Expressroute", + "text": "Collect, analyze, and visualize metrics and logs: Collect metrics and logs as part of the overall monitoring strategy of your solution. Set alerts to proactively notify you when a certain threshold is met. Review metrics and dashboards available through ExpressRoute Insights to view details of your peering components all in a single place.", + "description": "", + "type": "checklist", + "guid": "bf07d7c1-88dc-47c6-8660-97ff9170758a" }, { "waf": "Operations", "service": "Azure Expressroute", - "text": "Configure connection monitoring", - "description": "Connection monitoring allows you to monitor connectivity between your on-premises resources and Azure over the ExpressRoute private peering and Microsoft peering connection. 
Connection monitor can detect networking issues by identifying where along the network path the problem is and help you quickly resolve configuration or hardware failures.", + "text": "Choose the closest peering locations to your on-premises network to reduce latency and costs.", + "description": "By choosing the closest peering location to your on-premises network, you can reduce latency and costs, ensuring optimal performance and cost-effectiveness.", "type": "recommendation", - "guid": "c6766a4e-7531-4335-af44-4fd1a3c706f4" + "guid": "33a2f571-80f6-4e66-973b-2939a43968d5" }, { "waf": "Operations", "service": "Azure Expressroute", - "text": "Configure Service Health", - "description": "Set up Service Health notifications to alert when planned and upcoming maintenance is happening to all ExpressRoute circuits in your subscription. Service Health also displays past maintenance along with RCA if an unplanned maintenance were to occur.", + "text": "Configure Connection Monitor between your on-premises and Azure network.", + "description": "Connection Monitor can detect networking issues by identifying where along the network path the problem is and help you quickly resolve configuration or hardware failures. Connection Monitor is part of Azure Monitor logs.", "type": "recommendation", - "guid": "5ff3a7b5-974a-466d-ab01-ad90c143969d" + "guid": "4fd1541c-e50d-457f-8b2d-d1bf40ad548f" }, { "waf": "Operations", "service": "Azure Expressroute", - "text": "Review metrics with Network Insights", - "description": "ExpressRoute Insights with Network Insights allow you to review and analyze ExpressRoute circuits, gateways, connections metrics and health dashboards. 
ExpressRoute Insights also provide a topology view of your ExpressRoute connections where you can view details of your peering components all in a single place.Metrics available:- Availability- Throughput- Gateway metrics", + "text": "Configure dynamic routing for your Microsoft peering enabled ExpressRoute circuit.", + "description": "Dynamic routing allows for more efficient and flexible routing, ensuring optimal path selection and automatic updates to routing tables in response to network changes.", "type": "recommendation", - "guid": "210546e8-29e3-40d9-869f-6236fddaadd0" + "guid": "fbe58957-7337-4213-a742-f42f0816e979" }, { "waf": "Operations", "service": "Azure Expressroute", - "text": "Review ExpressRoute resource metrics", - "description": "ExpressRoute uses Azure Monitor to collect metrics and create alerts base on your configuration. Metrics are collected for ExpressRoute circuits, ExpressRoute gateways, ExpressRoute gateway connections, and ExpressRoute Direct. These metrics are useful for diagnosing connectivity problems and understanding the performance of your ExpressRoute connection.", + "text": "Configure Service Health notifications to alert you when planned and upcoming maintenance is scheduled for all ExpressRoute circuits in your subscription. Service Health also displays past maintenance events along with Root Cause Analysis (RCA) if an unplanned maintenance event occurs.", + "description": "Service Health notifications provide timely alerts about planned and unplanned maintenance, outages, and early warnings about potential issues.
This allows you to stay informed about the status of your ExpressRoute circuits.", "type": "recommendation", - "guid": "8031ed87-7573-469a-9b05-01f4ff4d9231" + "guid": "6ef006d5-2e0a-44c2-a39d-ad421414722d" }, { - "waf": "performance", + "waf": "Operations", "service": "Azure Expressroute", - "text": "Test ExpressRoute gateway performance to meet work load requirements.", - "description": "", - "type": "checklist", - "guid": "256753af-fb4b-49b2-a965-4b65265ee8dd" + "text": "Configure Traffic Collector for ExpressRoute", + "description": "ExpressRoute Traffic Collector enables the sampling of network flows over your ExpressRoute circuits. It supports both Private peering and Microsoft peering, providing near real-time visibility into network throughput and performance.", + "type": "recommendation", + "guid": "2b3a7f9b-630d-469f-b5aa-01e90b6035ae" }, { - "waf": "performance", + "waf": "Operations", "service": "Azure Expressroute", - "text": "Increase the size of the ExpressRoute gateway.", - "description": "", - "type": "checklist", - "guid": "9bc85bda-be71-4df0-924c-2604ef7f05fa" + "text": "Review metrics with Network Insights. ExpressRoute Insights with Network Insights allow you to review and analyze ExpressRoute circuits, gateways, connections metrics and health dashboards. ExpressRoute Insights also provide a topology view of your ExpressRoute connections where you can view details of your peering components all in a single place.", + "description": "Network Insights offers a centralized platform to monitor various metrics across ExpressRoute circuits, gateways, and connections, providing a comprehensive view of network health and performance.", + "type": "recommendation", + "guid": "cf0d1953-6ee0-40d2-b4c9-5b332e4ecbb8" + }, + { + "waf": "Operations", + "service": "Azure Expressroute", + "text": "Review ExpressRoute resource metrics. 
Use Azure Monitor to collect metrics and create alerts based on your configuration.", + "description": "Metrics are collected for ExpressRoute circuits, ExpressRoute gateways, ExpressRoute gateway connections, and ExpressRoute Direct. These metrics are useful for diagnosing connectivity problems and understanding the performance of your ExpressRoute connection.", + "type": "recommendation", + "guid": "28c11605-f9af-479f-b854-d5467ff4160a" + }, + { + "waf": "Operations", + "service": "Azure Expressroute", + "text": "Review ExpressRoute metrics and create alerts. ExpressRoute uses Azure Monitor to collect metrics and create alerts based on your configuration. Follow the recommendations for designing and creating a monitoring system to implement your monitoring strategy for ExpressRoute and your workloads.", + "description": "Metrics are collected for ExpressRoute circuits, ExpressRoute gateways, ExpressRoute gateway connections, and ExpressRoute Direct. These metrics are useful for diagnosing connectivity problems and understanding the performance of your ExpressRoute connection.", + "type": "recommendation", + "guid": "a7239cf7-690a-40fe-8d6c-5355e27a51ab" }, { "waf": "performance", "service": "Azure Expressroute", - "text": "Upgrade the ExpressRoute circuit bandwidth.", + "text": "Test ExpressRoute gateway performance to meet work load requirements: Use the Azure Connectivity Toolkit to test performance across your ExpressRoute circuit to understand bandwidth capacity and latency of your network connection.", "description": "", "type": "checklist", - "guid": "102ee202-4b37-498a-8826-d698d11e3b03" + "guid": "78f84baf-1a25-4a05-a15b-a4eca91e82b2" }, { "waf": "performance", "service": "Azure Expressroute", - "text": "Enable ExpressRoute FastPath for higher throughput.", + "text": "Plan for scaling: Based on your scalability requirements, choose the right ExpressRoute circuit SKU and also the Virtual Network Gateway SKUs. Each SKU offers different features and limits. 
Take into consideration the performance, feature, and routing needs of your network. For additional scalability guidance for your solution, see Recommendations for optimizing scaling and partitioning.", "description": "", "type": "checklist", - "guid": "627c2d5f-e638-41fd-be98-9ba1bf195ce3" + "guid": "fddf3631-8314-4e23-b91b-faea977980a0" }, { "waf": "performance", "service": "Azure Expressroute", - "text": "Monitor the ExpressRoute circuit and gateway metrics.", + "text": "Monitor the performance of ExpressRoute resources: Collect and analyze the performance telemetry in accordance with the WAF Recommendations for collecting performance data. Validate that it meets your performance targets and set up alerts to proactively notify you when a certain threshold is met.", "description": "", "type": "checklist", - "guid": "040f4b75-2706-42f3-9a9c-cee611032d91" + "guid": "91a2fd3e-2f49-4dc0-96d1-48cdf73846d2" + }, + { + "waf": "Performance", + "service": "Azure Expressroute", + "text": "Test ExpressRoute gateway performance to meet workload requirements with the Azure Connectivity Toolkit. Schedule bandwidth-intensive operations such as backups and performance testing at times of low production traffic.", + "description": "The toolkit provides user-friendly tools and interfaces that simplify the process of configuring and managing network connections to Azure. The toolkit includes tools to optimize network performance, ensuring efficient and reliable connectivity to Azure services.", + "type": "recommendation", + "guid": "0f156e04-4748-4098-8646-41c20e4fda2c" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Test ExpressRoute gateway performance to meet work load requirements.", - "description": "Use Azure Connectivity Toolkit to test performance across your ExpressRoute circuit to understand bandwidth capacity and latency of your network connection.", + "text": "Plan for scaling of ExpressRoute circuits.
Upgrade your ExpressRoute circuit bandwidth to meet your production workload requirements. Circuit bandwidth is shared between all virtual networks connected to the ExpressRoute circuit. Depending on your workload, one or more virtual networks can use up all the bandwidth on the circuit. For more information, see ExpressRoute limits.", + "description": "Upgrading the bandwidth ensures that the network can handle increasing data volumes and more users without compromising performance.", "type": "recommendation", - "guid": "256753af-fb4b-49b2-a965-4b65265ee8dd" + "guid": "505a4796-5aff-429b-b37a-f19dd639e398" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Increase the size of the ExpressRoute gateway.", - "description": "Upgrade to a higher gateway SKU for improved throughput performance between on-premises and Azure environment.", + "text": "Plan for scaling of ExpressRoute Virtual Network Gateway. Upgrade your ExpressRoute Virtual Network Gateway SKU to meet your production workload requirements.", + "description": "Upgrading to a larger gateway SKU provides higher throughput capabilities, allowing more data to be transferred between on-premises networks and Azure more quickly. A larger gateway can manage more simultaneous connections and higher volumes of traffic, reducing the likelihood of network congestion and bottlenecks.", "type": "recommendation", - "guid": "9bc85bda-be71-4df0-924c-2604ef7f05fa" + "guid": "f35ae380-4c7d-46c7-87fb-50ded237e6ef" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Upgrade ExpressRoute circuit bandwidth", - "description": "Upgrade your circuit bandwidth to meet your work load requirements. Circuit bandwidth is shared between all virtual networks connected to the ExpressRoute circuit. 
Depending on your work load, one or more virtual networks can use up all the bandwidth on the circuit.", + "text": "Configure Scalable Gateways to automatically scale for performance.", + "description": "Scalable Gateways allows you to scale up and down automatically with your gateway instances to accommodate performance needs. ErGwScale SKU also enables you to achieve 40-Gbps connectivity to virtual machines and Private Endpoints within the virtual network.", "type": "recommendation", - "guid": "db8f8202-db07-497f-be72-17db8bda90c5" + "guid": "88f6c371-e433-4413-abb0-9464392db775" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Enable ExpressRoute FastPath for higher throughput", - "description": "If you're using an Ultra performance or an ErGW3AZ virtual network gateway, you can enable FastPath to improve the data path performance between your on-premises network and Azure virtual network.", + "text": "Enable ExpressRoute FastPath for higher throughput on your virtual network gateway.", + "description": "This feature improves the data path performance between your on-premises network and your virtual network resources by bypassing the gateway. As business needs grow, FastPath provides the necessary bandwidth and performance to support increasing data volumes and more users without compromising performance. Enabling FastPath ensures that the network can handle future expansions and new applications, providing long-term performance efficiency.", "type": "recommendation", - "guid": "01566559-f881-409b-b04e-7d79a71f18e4" + "guid": "8b1d16e1-938c-4bbb-b548-4bef658bd167" }, { "waf": "Performance", "service": "Azure Expressroute", - "text": "Monitor ExpressRoute circuit and gateway metrics", - "description": "Set up alerts base on ExpressRoute metrics to proactively notify you when a certain threshold is met. 
These metrics are useful to understand anomalies that can happen with your ExpressRoute connection such as outages and maintenance happening to your ExpressRoute circuits.", + "text": "Monitor ExpressRoute circuit, port, and gateway metrics. Configure alerts for ExpressRoute metrics to proactively notify you when a certain threshold is met. ExpressRoute circuit metrics supports metrics such as Arp Availability, BitsInPerSecond, DroppedInBitsPerSecond. ExpressRoute port metrics supports metrics such as AdminState, BitsInPerSecond, and FastPathRoutesCount. ExpressRoute Gateway metrics supports metrics such as Bits In Per Second, Active Flows, and Count Of Routes Advertised to Peer.
Monitor performance targets with Connection Monitor.", + "description": "ExpressRoute circuit, port, and gateway metrics are useful to understand anomalies that can happen with your ExpressRoute connection such as outages and maintenance happening to your ExpressRoute circuits. Connection Monitor can detect networking issues by identifying where along the network path the problem is and help you quickly resolve configuration or hardware failures.", "type": "recommendation", - "guid": "6440df71-d371-4190-920f-01c1815446db" + "guid": "8727a955-6243-4408-89dd-a0befb7742f3" }, { "waf": "reliability", @@ -1804,10 +2052,10 @@ { "waf": "Reliability", "service": "Azure Files", - "text": "As a part of your backup and recovery strategy, enable\u202fsoft delete\u202fand\u202fuse snapshots for point-in-time restore. You can use Azure Backup to back up your SMB file shares. You can also use Azure File Sync to back up on-premises SMB file shares to an Azure file share. Azure Backup also allows you to do a vaulted backup (preview) of Azure Files to protect your data from ransomware attacks or source data loss due to a malicious actor or rogue admin. By using vaulted backup, Azure Backup copies and stores data in the Recovery Services vault. This creates an offsite copy of data that you can retain for up to 99 years. Azure Backup creates and manages the recovery points as per the schedule and retention defined in the backup policy. Learn more.", + "text": "As a part of your backup and recovery strategy, enable\u202fsoft delete\u202fand\u202fuse snapshots for point-in-time restore. You can use Azure Backup to back up your SMB file shares. You can also use Azure File Sync to back up on-premises SMB file shares to an Azure file share. Azure Backup also allows you to do a vaulted backup of Azure Files to protect your data from ransomware attacks or source data loss due to a malicious actor or rogue admin. 
By using vaulted backup, Azure Backup copies and stores data in the Recovery Services vault. This creates an offsite copy of data that you can retain for up to 99 years. Azure Backup creates and manages the recovery points as per the schedule and retention defined in the backup policy. Learn more.", "description": "Soft delete works on a file share level to protect Azure file shares against accidental deletion. Point-in-time restore protects against accidental deletion or corruption because you can restore file shares to an earlier state. For more information, see Data protection overview.", "type": "recommendation", - "guid": "8bcb6fde-bf94-4f36-8eae-347e5d4f0dea" + "guid": "6df35085-d01a-4ac9-8fa7-b01f33bb580f" }, { "waf": "security", @@ -2644,26 +2892,18 @@ { "waf": "reliability", "service": "Azure Front Door", - "text": "Estimate the traffic pattern and volume. The number of requests from the client to the Azure Front Door edge might influence your tier choice. If you need to support a high volume of requests, consider the Azure Front Door Premium tier because performance ultimately impacts availability. However, there's a cost tradeoff. These tiers are described in Performance Efficiency.", + "text": "Choose your deployment strategy. The fundamental deployment approaches are active-active and active-passive. Active-active deployment means that multiple environments or stamps that run the workload serve traffic. Active-passive deployment means that only the primary region handles all traffic, but it fails over to the secondary region when necessary. In a multiregion deployment, stamps or application instances run in different regions for higher availability with a global load balancer, like Azure Front Door, that distributes traffic. 
Therefore, it's important to configure the load balancer for the appropriate deployment approach.", "description": "", "type": "checklist", - "guid": "59ed40bd-06a0-4125-ab99-afc88a248aa5" + "guid": "05abc5bc-5373-4a9e-bad6-4facaf075111" }, { "waf": "reliability", "service": "Azure Front Door", - "text": "Choose your deployment strategy. The fundamental deployment approaches are active-active and active-passive. Active-active deployment means that multiple environments or stamps that run the workload serve traffic. Active-passive deployment means that only the primary region handles all traffic, but it fails over to the secondary region when necessary. In a multiregion deployment, stamps run in different regions for higher availability with a global load balancer, like Azure Front Door, that distributes traffic. Therefore, it's important to configure the load balancer for the appropriate deployment approach.", + "text": "Use the same host name on each layer. To ensure that cookies or redirect URLs work properly, preserve the original HTTP host name when you use a reverse proxy, like Azure Front Door, in front of a web application.", "description": "", "type": "checklist", - "guid": "51f023f3-53b4-4878-8548-2b08a6b095ab" - }, - { - "waf": "reliability", - "service": "Azure Front Door", - "text": "Use the same host name on Azure Front Door and origin servers. To ensure that cookies or redirect URLs work properly, preserve the original HTTP host name when you use a reverse proxy, like a load balancer, in front of a web application.", - "description": "", - "type": "checklist", - "guid": "486f318a-8747-45e5-a5f4-97642d4fada6" + "guid": "7fdbb044-847a-4944-87b6-67107e2d208f" }, { "waf": "reliability", @@ -2676,10 +2916,10 @@ { "waf": "reliability", "service": "Azure Front Door", - "text": "Take advantage of the built-in content delivery network functionality in Azure Front Door. 
The content delivery network feature of Azure Front Door has hundreds of edge locations and can help withstand distributed denial of service (DDoS) attacks. These capabilities help improve reliability.", + "text": "Cache static content. The content delivery feature of Azure Front Door has hundreds of edge locations and can help withstand traffic surges and distributed denial of service (DDoS) attacks. These capabilities help improve reliability.", "description": "", "type": "checklist", - "guid": "96c8dc89-cafe-411e-9080-35c13fffb8b3" + "guid": "b6d60b2d-ef45-487d-8967-3aa90178ce9b" }, { "waf": "reliability", @@ -2692,48 +2932,48 @@ { "waf": "Reliability", "service": "Azure Front Door", - "text": "Choose a routing method that supports your deployment strategy. The weighted method, which distributes traffic based on the configured weight coefficient, supports active-active models. A priority-based value that configures the primary region to receive all traffic and send traffic to the secondary region as a backup supports active-passive models. Combine the preceding methods with latency so that the origin with the lowest latency receives traffic.", + "text": "Choose a routing method that supports your deployment strategy. The weighted method, which distributes traffic based on the configured weight coefficient, supports active-active models. A priority-based value that configures the primary region to receive all traffic and send traffic to the secondary region as a backup supports active-passive models. Combine the preceding methods with latency sensitivity configurations so that the origin with the lowest latency receives traffic.", "description": "You can select the best origin resource by using a series of decision steps and your design. 
The selected origin serves traffic within the allowable latency range in the specified ratio of weights.", "type": "recommendation", - "guid": "1a6fba56-5098-4506-9be0-940fe556996c" + "guid": "de0f183f-2be0-44d2-8ffd-7c2f2193177d" }, { "waf": "Reliability", "service": "Azure Front Door", - "text": "Support redundancy by having multiple origins in one or more back-end pools. Always have redundant instances of your application and make sure each instance exposes an endpoint or origin. You can place those origins in one or more back-end pools.", - "description": "Multiple origins support redundancy by distributing traffic across multiple instances of the application. If one instance is unavailable, then other back-end origins can still receive traffic.", + "text": "Support redundancy by having multiple origins in one or more origin groups. Always have redundant instances of your application and make sure each instance exposes an origin. You can place those origins in one or more origin groups.", + "description": "Multiple origins support redundancy by distributing traffic across multiple instances of the application. If one instance is unavailable, then other origins can still receive traffic.", "type": "recommendation", - "guid": "00f51ce2-46a9-4051-ab0e-762743d0837d" + "guid": "61363c69-6b55-4c96-b0fd-59294a74bcb8" }, { "waf": "Reliability", "service": "Azure Front Door", - "text": "Set up health probes on the origin. Configure Azure Front Door to conduct health checks to determine if the back-end instance is available and ready to continue receiving requests.", - "description": "Enabled health probes are part of the health monitoring pattern implementation. Health probes make sure that Azure Front Door only routes traffic to instances that are healthy enough to handle requests. For more information, see Best practices on health probes.", + "text": "Set up health probes on the origin. 
Configure Azure Front Door to conduct health checks to determine if the origin instance is available and ready to continue receiving requests. For more information, see Best practices on health probes.", + "description": "Enabled health probes are part of the health monitoring pattern implementation. Health probes make sure that Azure Front Door only routes traffic to instances that are healthy enough to handle requests.", "type": "recommendation", - "guid": "17fbec2c-ddb4-4490-946c-a151ae0fadd4" + "guid": "f2e05285-5b06-4a98-af9d-3c504067e20a" }, { "waf": "Reliability", "service": "Azure Front Door", - "text": "Set a timeout on forwarding requests to the back end. Adjust the timeout setting according to your endpoints' needs. If you don't, Azure Front Door might close the connection before the origin sends the response. You can also lower the default timeout for Azure Front Door if all of your origins have a shorter timeout. For more information, see Troubleshooting unresponsive requests.", - "description": "Timeouts help prevent performance issues and availability issues by terminating requests that take longer than expected to complete.", + "text": "Set a timeout on forwarding requests to the origin, and avoid long-running requests. Adjust the timeout setting according to your endpoints' needs. If you don't, Azure Front Door might close the connection before the origin sends the response. You can also lower the default timeout for Azure Front Door if all of your origins have a shorter timeout. For more information, see Troubleshooting unresponsive requests.", + "description": "Long-running requests consume system resources. 
Timeouts help prevent performance issues and availability issues by terminating requests that take longer than expected to complete.", "type": "recommendation", - "guid": "1a308f11-1d93-4d57-bd84-cbd8f6198dd2" + "guid": "dd71828a-09b2-415a-863e-2274f527d581" }, { "waf": "Reliability", "service": "Azure Front Door", - "text": "Use the same host name on Azure Front Door and your origin. Azure Front Door can rewrite the host header of incoming requests, which is useful when you have multiple custom domain names that route to one origin. However, rewriting the host header might cause issues with request cookies and URL redirection.", - "description": "Set the same host name to prevent malfunction with session affinity, authentication, and authorization. For more information, see Preserve the original HTTP host name between a reverse proxy and its back-end web application.", + "text": "Use the same host name on Azure Front Door and your origin. Azure Front Door can rewrite the host header of incoming requests, which is useful when you have multiple custom domain names that route to one origin. However, rewriting the host header might cause issues with request cookies and URL redirection. For more information, see Preserve the original HTTP host name.", + "description": "Set the same host name to prevent malfunction with session affinity, authentication, and authorization.", "type": "recommendation", - "guid": "7af90aa7-b21f-432f-858b-2d872c752d7f" + "guid": "c62dba2a-4479-43d8-9999-17ef410ba6cb" }, { "waf": "Reliability", "service": "Azure Front Door", "text": "Decide if your application requires session affinity. If you have high reliability requirements, we recommend that you disable session affinity.", - "description": "With session affinity, user connections stay on the same origin during the user session. 
If that origin becomes unavailable, the user experience might be disrupted.", + "description": "With session affinity, user connections stay on the same origin during the user session. In some situations, a single origin might become overloaded with requests while other origins are idle. If that origin becomes unavailable, the user experience might be disrupted.", "type": "recommendation", "guid": "f66a8d49-8d0a-4952-9db2-ac2e526f08ad" }, @@ -2756,10 +2996,10 @@ { "waf": "security", "service": "Azure Front Door", - "text": "Protect the back-end servers. The front end acts as the single point of ingress to the application.", + "text": "Protect the origin servers. Azure Front Door is the front end, and is the single point of ingress to the application.", "description": "", "type": "checklist", - "guid": "b31346d7-4fde-42c2-82fe-2e4d54aa8f1b" + "guid": "d0fb78cb-c9e5-46ce-9a2a-5111ae9d57ca" }, { "waf": "security", @@ -2780,10 +3020,10 @@ { "waf": "security", "service": "Azure Front Door", - "text": "Protect Azure Front Door against unexpected traffic. Azure Front Door uses the basic plan of Azure DDoS protection to protect application endpoints from DDoS attacks. If you need to expose other public IP addresses from your application, consider adding the DDoS Protection standard plan for those addresses for advanced protection and detection capabilities.", + "text": "Protect against unexpected traffic. The architecture of Azure Front Door provides built-in DDoS protection to protect application endpoints from DDoS attacks. 
If you need to expose other public IP addresses from your application, consider adding the Azure DDoS Protection standard plan for those addresses for advanced protection and detection capabilities.", "description": "", "type": "checklist", - "guid": "d323ca3c-f7df-4f7e-b7c9-698d2bdad3e2" + "guid": "a1e91410-cc71-4093-90d2-77d581f7c11c" }, { "waf": "security", @@ -2817,29 +3057,45 @@ "type": "recommendation", "guid": "a6fe49f4-0b1f-4677-af1f-b766f073ac6c" }, + { + "waf": "Security", + "service": "Azure Front Door", + "text": "Send the host header to the origin.", + "description": "The back-end services should be aware of the host name so that they can create rules to accept traffic only from that host.", + "type": "recommendation", + "guid": "c23845cd-a40f-46ab-88f6-ef0036595e15" + }, + { + "waf": "Security", + "service": "Azure Front Door", + "text": "Secure the connections from Azure Front Door to your origins. Enable Private Link connectivity to supported origins. If your origin doesn't support Private Link connectivity, use service tags and the `X-Azure-FDID` header to verify the source of the request is your Azure Front Door profile.", + "description": "Ensure that all traffic flows through Azure Front Door, and gets the security benefits such as DDoS protection and WAF inspection.", + "type": "recommendation", + "guid": "de60a178-a3c0-4730-884e-89ee2b0290ba" + }, { "waf": "Security", "service": "Azure Front Door", "text": "Enable end-to-end TLS, HTTP to HTTPS redirection, and managed TLS certificates when applicable. Review the TLS best practices for Azure Front Door. Use TLS version 1.2 as the minimum allowed version with ciphers that are relevant for your application. Azure Front Door managed certificates should be your default choice for ease of operations. 
However, if you want to manage the lifecycle of the certificates, use your own certificates in Azure Front Door custom domain endpoints and store them in Key Vault.", - "description": "TLS ensures that data exchanges between the browser, Azure Front Door, and the back-end origins are encrypted to prevent tampering. Key Vault offers managed certificate support and simple certificate renewal and rotation.", + "description": "TLS ensures that data exchanges between the browser, Azure Front Door, and the origins are encrypted to prevent tampering. Key Vault offers managed certificate support and simple certificate renewal and rotation.", "type": "recommendation", "guid": "450b511f-9e82-45ce-986b-769ea00dbeaa" }, { "waf": "cost", "service": "Azure Front Door", - "text": "Review Azure Front Door tiers and pricing. Use the pricing calculator to estimate the realistic costs for each tier. Compare the features and suitability of each tier for your scenario. For instance, only the Premium tier supports connecting to your origin via Private Link.", + "text": "Review service tiers and pricing. Use the pricing calculator to estimate the realistic costs for each tier of Azure Front Door. Compare the features and suitability of each tier for your scenario. For instance, only the Premium tier supports connecting to your origin via Private Link.", "description": "", "type": "checklist", - "guid": "baeb625f-93ac-4181-be7b-07f474584985" + "guid": "72190ce1-1bdb-4014-931f-b9868d297bc4" }, { "waf": "cost", "service": "Azure Front Door", - "text": "Consider bandwidth costs. The bandwidth costs of Azure Front Door depend on the tier that you choose and the type of data transfer. Azure Front Door provides built-in reports for billable metrics. To assess your costs related to bandwidth and where you can focus your optimization efforts, see Azure Front Door reports.", + "text": "Consider bandwidth costs. 
The bandwidth costs of Azure Front Door depend on the tier that you choose and the type of data transfer. To learn about Azure Front Door billing, see Understand Azure Front Door billing.", "description": "", "type": "checklist", - "guid": "ed02627c-4a6d-4d6e-982f-a1fbee4a7259" + "guid": "98e706b0-e72b-4729-85e7-88b6690af882" }, { "waf": "cost", @@ -2892,10 +3148,10 @@ { "waf": "Cost", "service": "Azure Front Door", - "text": "Disable health checks in single back-end pools.If you have only one origin configured in your Azure Front Door origin group, these calls are unnecessary.", - "description": "You can save on bandwidth costs by disabling requests that aren't required to make routing decisions.", + "text": "Disable health checks in origin groups with a single origin. If you have only one origin configured in your Azure Front Door origin group, these calls are unnecessary.", + "description": "You can save on bandwidth costs by disabling health check requests that aren't required to make routing decisions.", "type": "recommendation", - "guid": "5d3ad5cb-d897-4ae7-8d1c-ae8398c79ed6" + "guid": "4e149ec6-37b6-4f8a-8e77-7e15926e9ced" }, { "waf": "operations", @@ -2916,18 +3172,18 @@ { "waf": "operations", "service": "Azure Front Door", - "text": "Handle progressive exposure by using Azure Front Door routing methods. For a weighted load balancing approach you can use a canary deployment to send a specific percentage of traffic to a back end. This approach helps you test new features and releases in a controlled environment before you roll them out.", + "text": "Handle progressive exposure. Azure Front Door provides multiple routing methods. For a weighted load balancing approach you can use a canary deployment to send a specific percentage of traffic to an origin. 
This approach helps you test new features and releases in a controlled environment before you roll them out.", "description": "", "type": "checklist", - "guid": "bb3943fb-6d97-458d-be6a-d9b71e8898d7" + "guid": "181a84dc-b0da-4038-a1ff-75fd9b97d327" }, { "waf": "operations", "service": "Azure Front Door", - "text": "Collect and analyze Azure Front Door operational data as part of your workload monitoring. Capture relevant Azure Front Door logs and metrics with Azure Monitor Logs. This data helps you troubleshoot, understand user behaviors, and optimize operations.", + "text": "Collect and analyze operational data as part of your workload monitoring. Capture relevant Azure Front Door logs and metrics with Azure Monitor Logs. This data helps you troubleshoot, understand user behaviors, and optimize operations.", "description": "", "type": "checklist", - "guid": "5e1264a9-c6cc-4d4e-8e73-7ca4b3f05c17" + "guid": "1fdc61d2-d6ea-4c4a-88f6-3bb5871f8051" }, { "waf": "operations", @@ -2988,10 +3244,10 @@ { "waf": "performance", "service": "Azure Front Door", - "text": "Analyze performance data by regularly reviewing Azure Front Door reports. These reports provide insights into various metrics that serve as performance indicators at the technology level.", + "text": "Analyze performance data by regularly reviewing performance metrics. Azure Front Door reports provide insights into various metrics that serve as performance indicators at the technology level.", "description": "", "type": "checklist", - "guid": "87c450a0-6a4a-453c-ae38-b8f21952ab79" + "guid": "6254e9d8-60a2-4c19-8c0b-09ddd477f19d" }, { "waf": "performance", @@ -3029,7 +3285,7 @@ "waf": "Performance", "service": "Azure Front Door", "text": "Enable caching. You can optimize query strings for caching. For purely static content, ignore query strings to maximize your use of the cache. If your application uses query strings, consider including them in the cache key. 
Including the query strings in the cache key allows Azure Front Door to serve cached responses or other responses, based on your configuration.", - "description": "Azure Front Door offers a robust content delivery network solution that caches content at the edge of the network. Caching reduces the load on the back-end servers and reduces data movement across the network, which helps offload bandwidth usage.", + "description": "Azure Front Door offers a robust content delivery network solution that caches content at the edge of the network. Caching reduces the load on the origin servers and reduces data movement across the network, which helps offload bandwidth usage.", "type": "recommendation", "guid": "2f6a39a1-884a-4661-accd-f534bf795a96" }, @@ -3052,618 +3308,506 @@ { "waf": "Performance", "service": "Azure Front Door", - "text": "Evaluate whether you should enable session affinity when requests from the same user should be directed to the same back-end server. From a reliability perspective, we don't recommend this approach. If you use this option, the application should gracefully recover without disrupting user sessions. There's also a tradeoff on load balancing because it restricts the flexibility of distributing traffic across multiple back ends evenly.", + "text": "Evaluate whether you should enable session affinity when requests from the same user should be directed to the same origin server. From a reliability perspective, we don't recommend this approach. If you use this option, the application should gracefully recover without disrupting user sessions. 
There's also a tradeoff on load balancing because it restricts the flexibility of distributing traffic across multiple origins evenly.", "description": "Optimize performance and maintain continuity for user sessions, especially when applications rely on maintaining state information locally.", "type": "recommendation", - "guid": "2a92f55e-a422-49f4-9a26-8ba819a17323" - }, - { - "waf": "reliability", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: For critical workloads, use availability zones for your AKS clusters.", - "description": "", - "type": "checklist", - "guid": "ad7540f5-4cb1-4fc5-b168-e1cd97a0aef6" + "guid": "534f3888-f695-422d-a437-c2c0c22ebcb2" }, { "waf": "reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Plan the IP address space to ensure your cluster can reliably scale, including handling of failover traffic in multi-cluster topologies.", + "text": "(Cluster) Build redundancy to improve resiliency. Use availability zones for your AKS clusters as part of your resiliency strategy to increase availability when you deploy to a single region. Many Azure regions provide availability zones. The zones are close enough to have low-latency connections among them, but far enough apart to reduce the likelihood that local outages will affect more than one zone.", "description": "", "type": "checklist", - "guid": "904fdfa5-af9a-4195-b683-e3bb7627b394" + "guid": "fada31b9-472d-415d-85b0-263742bca250" }, { "waf": "reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Review the Best practices for monitoring Kubernetes with Azure Monitor to determine the best monitoring strategy for your workloads.", + "text": "(Cluster and workload) Monitor reliability and overall health indicators of the cluster and workloads. 
Collect logs and metrics to monitor workload health, identify performance and reliability trends, and troubleshoot problems.", "description": "", "type": "checklist", - "guid": "00c83f06-95ab-41c2-aef0-c770e177066f" + "guid": "8704b542-be27-4b2e-891f-5648051757d4" }, { "waf": "reliability", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Ensure workloads are built to support horizontal scaling and report application readiness and health.", + "text": "(Cluster and workload) Host application pods in user node pools. By isolating system pods from application workloads, you help ensure that AKS essential services are unaffected by the resource demands or potential problems caused by a workload that runs in user node pools.", "description": "", "type": "checklist", - "guid": "fa12c27b-df70-41fd-a93a-680e3cacf7ac" + "guid": "4f7c4919-2e0a-43ae-97c4-5b5d32d23b39" }, { "waf": "reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Ensure your workload is running on user node pools and chose the right size SKU. At a minimum, include two nodes for user node pools and three nodes for the system node pool.", + "text": "(Cluster and workload) Factor the AKS uptime service-level agreement (SLA) into your availability and recovery targets. To define the reliability and recovery targets for your cluster and workload, follow the guidance in Recommendations for defining reliability targets. 
Then formulate a design that meets those targets.", "description": "", "type": "checklist", - "guid": "f2297ffc-fec7-43c6-9187-9739264c9d66" + "guid": "b84b74df-a04b-478e-9e4c-7bc413ca376b" }, { "waf": "reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use the AKS Uptime SLA to meet availability targets for production workloads.", + "text": "(Cluster and workload) Protect the AKS cluster service using Azure Backup by storing recovery points in a Backup vault and perform restore during any disaster scenario. To back up and restore the containerized applications and data running in AKS clusters, follow the guidance in the AKS backup overview for configuring protection.", "description": "", "type": "checklist", - "guid": "0b542d27-1846-4d1b-9c5e-bd7c0d079cb3" + "guid": "71e87a20-a241-4ead-90f5-89fdf152d0cd" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Control pod scheduling using node selectors and affinity.", - "description": "Allows the Kubernetes scheduler to logically isolate workloads by hardware in the node. Unlike tolerations, pods without a matching node selector can be scheduled on labeled nodes, which allows unused resources on the nodes to consume, but gives priority to pods that define the matching node selector. Use node affinity for more flexibility, which allows you to define what happens if the pod can't be matched with a node.", + "text": "(Cluster and workload) Control pod scheduling by using node selectors and affinity. In AKS, the Kubernetes scheduler can logically isolate workloads by hardware in the node. 
Unlike tolerations, pods that don't have a matching node selector can be scheduled on labeled nodes, but priority is given to pods that define the matching node selector.", + "description": "Node affinity results in more flexibility, which allows you to define what happens if the pod can't be matched with a node.", "type": "recommendation", - "guid": "b11892cd-f678-4399-bac8-b98095e250c6" + "guid": "b15637f2-0c2d-4905-80e1-19ac132c1d51" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Ensure proper selection of network plugin based on network requirements and cluster sizing.", - "description": "Azure CNI is required for specific scenarios, for example, Windows-based node pools, specific networking requirements and Kubernetes Network Policies. Reference Kubenet versus Azure CNI for more information.", + "text": "(Cluster) Choose the appropriate network plugin based on network requirements and cluster sizing. Different network plugins provide varying levels of functionality. Azure Container Networking Interface (Azure CNI) is required for specific scenarios, such as Windows-based node pools, some networking requirements, and Kubernetes network policies. 
For more information, see Kubenet versus Azure CNI.", + "description": "The right network plugin can help ensure better compatibility and performance.", "type": "recommendation", - "guid": "9e3e8182-95e7-4953-9a2b-9cc2f77bf717" + "guid": "a91d0fc3-b8b0-4551-94bf-15f234abd638" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Use the AKS Uptime SLA for production grade clusters.", - "description": "The AKS Uptime SLA guarantees: - `99.95%` availability of the Kubernetes API server endpoint for AKS Clusters that use Azure Availability Zones, or - `99.9%` availability for AKS Clusters that don't use Azure Availability Zones.", + "text": "(Cluster and workload) Use the AKS uptime SLA for production-grade clusters.", + "description": "The workload can support higher availability targets because of the higher availability guarantees of the Kubernetes API server endpoint for AKS clusters.", "type": "recommendation", - "guid": "4bfaa73e-a949-404a-a93b-a2b0a7feeff5" + "guid": "8a529829-04be-4b89-a7ad-e028e8999ddb" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use availability zones to maximize resilience within an Azure region by distributing AKS agent nodes across physically separate data centers.", - "description": "By spreading node pools across multiple zones, nodes in one node pool will continue running even if another zone has gone down. 
If colocality requirements exist, either a regular VMSS-based AKS deployment into a single zone or proximity placement groups can be used to minimize internode latency.", + "text": "(Cluster) Use availability zones to maximize resilience within an Azure region by distributing AKS agent nodes across physically separate datacenters. If colocality requirements exist, use a regular virtual machine scale sets-based AKS deployment into a single zone or use proximity placement groups to minimize internode latency.", + "description": "By spreading node pools across multiple zones, nodes in one node pool continue to run even if another zone goes down.", "type": "recommendation", - "guid": "74ff8612-55b7-4029-81bc-da363b133f16" + "guid": "d4762bd7-b189-4b8c-86a1-8ec2f5a79601" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Adopt a multiregion strategy by deploying AKS clusters deployed across different Azure regions to maximize availability and provide business continuity.", - "description": "Internet facing workloads should leverage Azure Front Door or Azure Traffic Manager to route traffic globally across AKS clusters.", + "text": "(Cluster and workload) Define pod resource requests and limits in application deployment manifests. 
Enforce those limits by using Azure Policy.", + "description": "Container CPU and memory resource limits are necessary to prevent resource exhaustion in your Kubernetes cluster.", "type": "recommendation", - "guid": "844d923f-cfe0-4a3a-97ff-67c072c4220c" + "guid": "c694e2b0-d685-4009-aba8-0f0f8c7027e9" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Define Pod resource requests and limits in application deployment manifests, and enforce with Azure Policy.", - "description": "Container CPU and memory resource limits are necessary to prevent resource exhaustion in your Kubernetes cluster.", + "text": "(Cluster and workload) Keep the system node pool isolated from application workloads. System node pools require a virtual machine (VM) SKU of at least 2 vCPUs and 4 GB of memory. We recommend that you use 4 vCPUs or more. For more information, see System and user node pools.", + "description": "The system node pool hosts critical system pods that are essential for the control plane of your cluster. By isolating these system pods from application workloads, you help ensure that the essential services are unaffected by the resource demands or potential problems caused by a workload.", "type": "recommendation", - "guid": "f1a92d3f-eab5-4dc2-b0e8-75865842f205" + "guid": "9c3edc9b-84b8-402b-adc6-b21773de115d" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Keep the System node pool isolated from application workloads.", - "description": "System node pools require a VM SKU of at least 2 vCPUs and 4 GB memory, but 4 vCPU or more is recommended. Reference System and user node pools for detailed requirements.", + "text": "(Cluster and workload) Separate applications to dedicated node pools based on specific requirements. 
Avoid large numbers of node pools to reduce management overhead.", + "description": "Applications can share the same configuration and need GPU-enabled VMs, CPU or memory-optimized VMs, or the ability to scale to zero. By dedicating node pools to specific applications, you can help ensure that each application gets the resources it needs without overprovisioning or underutilizing resources.", "type": "recommendation", - "guid": "55429203-5f2a-4ed1-9107-22d2c47b8ef1" + "guid": "e746533c-133f-432d-aaa9-93ef1ed2223d" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Separate applications to dedicated node pools based on specific requirements.", - "description": "Applications may share the same configuration and need GPU-enabled VMs, CPU or memory optimized VMs, or the ability to scale-to-zero. Avoid large number of node pools to reduce extra management overhead.", + "text": "(Cluster) Use a NAT gateway for clusters that run workloads that make many concurrent outbound connections.", + "description": "Azure NAT Gateway supports reliable egress traffic at scale and helps you avoid the reliability problems that Azure Load Balancer limitations can cause with high concurrent outbound traffic.", "type": "recommendation", - "guid": "3603ec74-ada6-4050-baa3-fb25386fb7df" + "guid": "be2932c0-8f2f-45a6-b96f-ccc10bed8fdb" }, { "waf": "Reliability", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use a NAT gateway for clusters that run workloads that make many concurrent outbound connections.", - "description": "To avoid reliability issues with Azure Load Balancer limitations with high concurrent outbound traffic, us a NAT Gateway instead to support reliable egress traffic at scale.", + "text": "(Cluster and workload) Use Azure Backup to protect your AKS cluster and restore it to alternate regions during a disaster. 
Azure Backup supports the backup and restore operations of containerized applications and data running for both cluster state and application data. You can use the backups in a regional disaster scenario and recover backups.", + "description": "Azure Backup with Azure Kubernetes Service (AKS) offers a fully managed, scalable, secure, and cost-effective solution. It enhances the reliability of the workload without the complexities of setting up and maintaining backup infrastructure.", "type": "recommendation", - "guid": "d441f009-ca99-4caf-9544-c71de5949d10" + "guid": "e5b54fd3-fb36-44c8-90b2-10999ce8a518" }, { "waf": "security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use Managed Identities to avoid managing and rotating service principles.", + "text": "(Cluster) Integrate with Microsoft Entra ID for identity and access management. Centralize identity management for your cluster by using Microsoft Entra ID. Any change in user account or group status is automatically updated in access to the AKS cluster. Establish identity as the primary security perimeter. The developers and application owners of your Kubernetes cluster need access to different resources.", "description": "", "type": "checklist", - "guid": "ff274266-a326-4c2c-9b85-7ce50c679b36" + "guid": "d13f9449-59b2-4c14-a332-ff44ee174ed7" }, { "waf": "security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use Kubernetes role-based access control (RBAC) with Microsoft Entra ID for least privilege access and minimize granting administrator privileges to protect configuration, and secrets access.", + "text": "(Cluster) Integrate with security monitoring and security information and event management tools. Use Microsoft Defender for Containers with Microsoft Sentinel to detect and quickly respond to threats across your clusters and the workloads that run on them. 
Enable AKS connector for Microsoft Sentinel to stream your AKS diagnostics logs into Microsoft Sentinel.", "description": "", "type": "checklist", - "guid": "d44b3684-99d7-4946-ab75-fb58d9d8626e" + "guid": "95cdc6b5-a61f-47e3-98cf-e6e1d4b26999" }, { "waf": "security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use Microsoft Defender for containers with Azure Sentinel to detect and quickly respond to threats across your cluster and workloads running on them.", + "text": "(Cluster and workload) Implement segmentation and network controls. To prevent data exfiltration, ensure that only authorized and safe traffic is allowed, and contain the blast radius of a security breach.", "description": "", "type": "checklist", - "guid": "12e0cc02-b585-4d6f-97be-1b073e57522c" + "guid": "fab4d0f5-8249-441e-9068-4a934b2c7ac0" }, { "waf": "security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Deploy a private AKS cluster to ensure cluster management traffic to your API server remains on your private network. Or use the API server allow list for non-private clusters.", + "text": "(Workload) Use a web application firewall (WAF) to scan incoming traffic for potential attacks. WAF can detect and mitigate threats in real time to help block malicious traffic before it reaches your applications. It provides robust protection against common web-based attacks, such as SQL injection, cross-site scripting, and other Open Web Application Security Project vulnerabilities. Some load balancers, such as Azure Application Gateway or Azure Front Door have an integrated WAF.", "description": "", "type": "checklist", - "guid": "8e9c196f-b93a-46c5-9507-1e5cad83dd21" + "guid": "9bd15b41-5aec-4acc-8f2f-b59eae92e7b6" }, { "waf": "security", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use a Web Application Firewall to secure HTTP(S) traffic.", + "text": "(Workload) Maintain a hardened workload's software supply chain. 
Ensure that your continuous integration and continuous delivery pipeline is hardened with container-aware scanning.", "description": "", "type": "checklist", - "guid": "4658f193-0b5d-41c9-b2c9-0a5f500799ea" + "guid": "99dcb350-bbe0-4623-8a59-1042db3ba142" }, { "waf": "security", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Ensure your CI/CID pipeline is hardened with container-aware scanning.", + "text": "(Cluster and workload) Implement extra protection for specialized secure workloads. If your cluster needs to run a sensitive workload, you might need to deploy a private cluster. Here are some examples:", "description": "", "type": "checklist", - "guid": "feba8909-0760-4a87-833a-b36fabbe722e" + "guid": "d8b9dd18-8f6b-4d6a-a6b5-d372f09a46c6" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use Microsoft Entra integration.", - "description": "Using Microsoft Entra ID centralizes the identity management component. Any change in user account or group status is automatically updated in access to the AKS cluster. The developers and application owners of your Kubernetes cluster need access to different resources.", + "text": "(Cluster) Use managed identities on the cluster.", + "description": "You can avoid the overhead associated with managing and rotating service principals.", "type": "recommendation", - "guid": "463da713-377d-46e1-a9cf-a384a0657537" + "guid": "322537de-47dc-4791-8a6d-27f72f08530b" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Authenticate with Microsoft Entra ID to Azure Container Registry.", - "description": "AKS and Microsoft Entra ID enables authentication with Azure Container Registry without the use of `imagePullSecrets` secrets. 
Review Authenticate with Azure Container Registry from Azure Kubernetes Service for more information.", + "text": "(Workload) Use Microsoft Entra Workload ID with AKS to access Microsoft Entra protected resources, such as Azure Key Vault and Microsoft Graph, from your workload.", + "description": "Use AKS Workload IDs to protect access to Azure resources by using Microsoft Entra ID RBAC without having to manage credentials directly in your code.", "type": "recommendation", - "guid": "f9f7946f-b9b6-40d1-9f30-0bfb38be5c74" + "guid": "6d65d848-6d76-4fa2-bcad-421545a0e58c" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Secure network traffic to your API server with private AKS cluster.", - "description": "By default, network traffic between your node pools and the API server travels the Microsoft backbone network; by using a private cluster, you can ensure network traffic to your API server remains on the private network only.", + "text": "(Cluster) Use Microsoft Entra ID to authenticate with Azure Container Registry from AKS.", + "description": "By using Microsoft Entra ID, AKS can authenticate with Container Registry without the use of `imagePullSecrets` secrets.", "type": "recommendation", - "guid": "10d61300-19e4-403c-bf22-a554c56b4afc" + "guid": "fc35f791-aa32-4dc7-a14c-9d63dd81c266" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: For non-private AKS clusters, use API server authorized IP ranges.", - "description": "When using public clusters, you can still limit the traffic that can reach your clusters API server by using the authorized IP range feature. 
Include sources like the public IPs of your deployment build agents, operations management, and node pools' egress point (such as Azure Firewall).", + "text": "(Cluster) Secure network traffic to your API server by using private AKS cluster if the workload requirements require higher levels of segmentation.", + "description": "By default, network traffic between your node pools and the API server travels the Microsoft backbone network. By using a private cluster, you can help ensure that network traffic to your API server remains on the private network only.", "type": "recommendation", - "guid": "b845e9a6-c5f3-4cf7-a956-6f1ffde88569" + "guid": "7028f08c-5586-4071-ad6e-bc41498ac80a" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Protect the API server with Microsoft Entra RBAC.", - "description": "Securing access to the Kubernetes API Server is one of the most important things you can do to secure your cluster. Integrate Kubernetes role-based access control (RBAC) with Microsoft Entra ID to control access to the API server. Disable local accounts to enforce all cluster access using Microsoft Entra ID-based identities.", + "text": "(Cluster) For public AKS clusters, use API server-authorized IP address ranges. 
Include sources like the public IP addresses of your deployment build agents, operations management, and node pools' egress point, such as Azure Firewall.", + "description": "When you use public clusters, you can significantly reduce the attack surface of your AKS cluster by limiting the traffic that can reach the API server of your clusters.", "type": "recommendation", - "guid": "b845e9a6-c5f3-4cf7-a956-6f1ffde88569" + "guid": "5ac5c96d-0142-4c99-a215-f4666f17ff4a" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Protect the API server with Microsoft Entra RBAC.", - "description": "Securing access to the Kubernetes API Server is one of the most important things you can do to secure your cluster. Integrate Kubernetes role-based access control (RBAC) with Microsoft Entra ID to control access to the API server. Disable local accounts to enforce all cluster access using Microsoft Entra ID-based identities.", + "text": "(Cluster) Protect the API server by using Microsoft Entra ID RBAC. Disable local accounts to enforce all cluster access by using Microsoft Entra ID-based identities.", + "description": "Securing access to the Kubernetes API server is one of the most important things that you can do to secure your cluster. Integrate Kubernetes RBAC with Microsoft Entra ID to control access to the API server.", "type": "recommendation", - "guid": "0b3a4624-de6e-4cfe-af5b-19ad4708087d" + "guid": "c7ad1096-597d-4bd4-868f-f6b641771ce6" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use Azure network policies or Calico.", - "description": "Secure and control network traffic between pods in a cluster.", + "text": "(Cluster) Use Azure network policies or Calico.", + "description": "By using policies, you can secure and control network traffic between pods in a cluster. 
Calico provides a richer set of capabilities, including policy ordering and priority, deny rules, and more flexible match rules.", "type": "recommendation", - "guid": "caabd1f2-e6a9-4c55-9115-349d1d6716bb" + "guid": "3dff5328-9b1d-4cec-8232-54668b7af2fb" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Secure clusters and pods with Azure Policy.", - "description": "Azure Policy can help to apply at-scale enforcement and safeguards on your clusters in a centralized, consistent manner. It can also control what functions pods are granted and if anything is running against company policy.", + "text": "(Cluster) Secure clusters and pods by using Azure Policy.", + "description": "Azure Policy can help apply at-scale enforcement and safeguards on your clusters in a centralized, consistent manner. It can also control what functions pods are granted and detect whether anything is running against company policy.", "type": "recommendation", - "guid": "d65686d0-3988-49b5-8a98-59818ba81c50" + "guid": "492d9708-1124-4f0a-b72a-1f706cfb68ed" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Secure container access to resources.", - "description": "Limit access to actions that containers can perform. Provide the least number of permissions, and avoid the use of root or privileged escalation.", + "text": "(Cluster) Secure container access to resources. Limit access to actions that containers can perform. Provide the least number of permissions, and avoid the use of root or privileged escalation. For Linux-based containers, see Secure container access to resources using built-in Linux security features.", + "description": "By restricting permissions and avoiding the use of root or privileged escalation, you help reduce the risk of security breaches. 
You can help ensure that, even if a container is compromised, the potential damage is minimized.", "type": "recommendation", - "guid": "4658f193-0b5d-41c9-b2c9-0a5f500799ea" + "guid": "58196b64-baca-4b92-a864-9bf2ea44c3b0" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Control cluster egress traffic.", - "description": "Ensure your cluster's outbound traffic is passing through a network security point such as Azure Firewall or an HTTP proxy.", + "text": "(Cluster) Control cluster egress traffic by ensuring that your cluster's outbound traffic passes through a network security point such as Azure Firewall or an HTTP proxy.", + "description": "By routing outbound traffic through Azure Firewall or an HTTP proxy, you can help enforce security policies that prevent unauthorized access and data exfiltration. This approach also simplifies the administration of security policies and makes it easier to enforce consistent rules across your entire AKS cluster.", "type": "recommendation", - "guid": "fa580200-65eb-4cc8-9c23-a5e68f1d86a7" + "guid": "a7cf042f-e553-4733-b74f-7338c303c9e7" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use the open-source Microsoft Entra Workload ID and Secrets Store CSI Driver with Azure Key Vault.", - "description": "Protect and rotate secrets, certificates, and connection strings in Azure Key Vault with strong encryption. Provides an access audit log, and keeps core secrets out of the deployment pipeline.", + "text": "(Cluster) Use the open-source Microsoft Entra Workload ID and Secrets Store CSI Driver with Key Vault.", + "description": "These features help you protect and rotate secrets, certificates, and connection strings in Key Vault by using strong encryption. 
They provide an access audit log and keep core secrets out of the deployment pipeline.", "type": "recommendation", - "guid": "0f5b5089-02fc-4f46-839c-7ff86610366a" + "guid": "394390a7-a5db-4d98-9622-5a1fe06fb0e2" }, { "waf": "Security", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use Microsoft Defender for Containers.", - "description": "Monitor and maintain the security of your clusters, containers, and their applications.", + "text": "(Cluster) Use Microsoft Defender for Containers.", + "description": "Microsoft Defender for Containers helps you monitor and maintain the security of your clusters, containers, and their applications.", "type": "recommendation", - "guid": "d2895e72-e9c1-4af7-9e5a-53edc92423e7" + "guid": "5d3287f5-b7a3-4a34-b43e-2ebbe8113c6b" }, { "waf": "cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use appropriate VM SKU per node pool and reserved instances where long-term capacity is expected.", + "text": "(Cluster) Include the pricing tiers for AKS in your cost model. To estimate costs, use the Azure pricing calculator and test different configuration and payment plans in the calculator.", "description": "", "type": "checklist", - "guid": "c5148caa-6478-4ba8-993b-1a8640716d66" + "guid": "37411aa4-30c6-4bc6-b7ff-0a14af08720b" }, { "waf": "cost", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Use appropriate managed disk tier and size.", + "text": "(Cluster) Get the best rates for your workload. Use the appropriate VM SKU for each node pool because it directly affects the cost to run your workloads. Choosing a high-performance VM without proper utilization can lead to wasteful spending. 
Selecting a less powerful VM can cause performance problems and increased downtime.", "description": "", "type": "checklist", - "guid": "90ac2d0c-790e-4b2b-be35-60a55a19397a" + "guid": "d7cafa89-dbe5-4df3-ba91-755ad20a0ab5" }, { "waf": "cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Review performance metrics, starting with CPU, memory, storage, and network, to identify cost optimization opportunities by cluster, nodes, and namespace.", + "text": "(Cluster and workload) Optimize workload components costs. Choose the most cost-effective region for your workload. Evaluate the cost, latency, and compliance requirements to ensure that you run your workload cost-effectively and that it doesn't affect your customers or create extra networking charges. The region where you deploy your workload in Azure can significantly affect the cost. Because of many factors, the cost of resources varies for each region in Azure.", "description": "", "type": "checklist", - "guid": "df9bb42d-c34f-488f-a50c-0624ca247647" + "guid": "6f61fef0-ca2f-429b-a482-e1a94669b2d0" }, { "waf": "cost", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architecture: Use autoscalers to scale in when workloads are less active.", + "text": "(Cluster and workload) Optimize workload scaling costs. Consider alternative vertical and horizontal scaling configurations to reduce scaling costs while still meeting all workload requirements. 
Use autoscalers to scale in when workloads are less active.", "description": "", "type": "checklist", - "guid": "43ffe0b7-7bc4-42b0-a370-af4a943b19c0" + "guid": "f99708d6-8940-4ccc-9705-68ded62db836" }, { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Align SKU selection and managed disk size with workload requirements.", - "description": "Matching your selection to your workload demands ensures you don't pay for unneeded resources.", - "type": "recommendation", - "guid": "c4ce3529-2891-41f1-aa3f-e87a791b01a3" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Select the right virtual machine instance type.", - "description": "Selecting the right virtual machine instance type is critical as it directly impacts the cost of running applications on AKS. Choosing a high-performance instance without proper utilization can lead to wasteful spending, while choosing a less powerful instance can lead to performance issues and increased downtime. To determine the right virtual machine instance type, consider workload characteristics, resource requirements, and availability needs.", - "type": "recommendation", - "guid": "1088060f-7467-48db-950d-5890503e2974" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Select virtual machines based on the Arm architecture.", - "description": "AKS supports creating ARM64 Ubuntu agent nodes, as well as a of mix Intel and ARM architecture nodes within a cluster that can bring better performance at a lower cost.", - "type": "recommendation", - "guid": "ce50c713-ad3e-4781-9193-63485491aa48" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Select Azure Spot Virtual Machines.", - "description": "Spot VMs allow you to take advantage of unutilized Azure capacity with significant discounts (up to 90% as compared to pay-as-you-go prices). 
If Azure needs capacity back, the Azure infrastructure evicts the Spot nodes.", - "type": "recommendation", - "guid": "7a281f48-217a-4668-aa30-c9d2c84d0d72" - }, - { - "waf": "Cost", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Select the appropriate region.", - "description": "Due to many factors, cost of resources varies per region in Azure. Evaluate the cost, latency, and compliance requirements to ensure you are running your workload cost-effectively and it doesn't affect your end-users or create extra networking charges.", - "type": "recommendation", - "guid": "433efe5b-3776-459c-8560-058f87773838" - }, - { - "waf": "Cost", + "waf": "cost", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Maintain small and optimized images.", - "description": "Streamlining your images helps reduce costs since new nodes need to download these images. Build images in a way that allows the container start as soon as possible to help avoid user request failures or timeouts while the application is starting up, potentially leading to overprovisioning.", - "type": "recommendation", - "guid": "65d3f70e-457a-4d62-8140-de2b0c4f7f99" + "text": "(Cluster and workload) Collect and analyze cost data. The foundation of enabling cost optimization is the spread of a cost-saving cluster. 
Develop a cost-efficiency mindset that includes collaboration between finance, operations, and engineering teams to drive alignment on cost-saving goals and bring transparency to cloud costs.", + "description": "", + "type": "checklist", + "guid": "c1396707-3475-4f3c-96d5-a5667eb99e28" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Enable Cluster Autoscaler to automatically reduce the number of agent nodes in response to excess resource capacity.", - "description": "Automatically scaling down the number of nodes in your AKS cluster lets you run an efficient cluster when demand is low and scale up when demand returns.", + "text": "(Cluster and workload) Align AKS SKU selection and managed disk size with workload requirements.", + "description": "Matching your selection to your workload demands helps ensure that you don't pay for unneeded resources.", "type": "recommendation", - "guid": "b4d583e3-7268-41d9-9e93-0394bed77298" + "guid": "f9e79da6-b405-4d65-bed4-371801ac8979" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Enable Node Autoprovision to automate VM SKU selection.", - "description": "Node Autoprovision simplifies the SKU selection process and decides, based on pending pod resource requirements, the optimal VM configuration to run workloads in the most efficient and cost effective manner.", + "text": "(Cluster) Choose the right VM instance types for your AKS node pools. To determine the right VM instance types, consider workload characteristics, resource requirements, and availability needs.", + "description": "Selecting the right VM instance type is crucial because it directly affects the cost to run applications on AKS. Choosing a high-performance instance without proper utilization can lead to wasteful spending. 
Choosing a less powerful instance can lead to performance problems and increased downtime.", "type": "recommendation", - "guid": "48ae7de4-26f7-457c-b1a4-a18467b7401d" + "guid": "fa7b8f8b-b97b-4964-9686-b8e61d3f6e05" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use the Horizontal Pod Autoscaler.", - "description": "Adjust the number of pods in a deployment depending on CPU utilization or other select metrics, which support cluster scale-in operations.", + "text": "(Cluster) Choose VMs based on the more power-efficient Arm architecture. AKS supports creating Arm64 node pools and a mix of Intel and Arm architecture nodes within a cluster.", + "description": "The Arm64 architecture provides a better price-to-performance ratio because of its lower power utilization and efficient compute performance. These capabilities can bring better performance at a lower cost.", "type": "recommendation", - "guid": "d3194174-d037-4a35-9cd5-244d377cfabb" + "guid": "9c98cb22-2c1a-4071-8377-9a5d03fc080b" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use Vertical Pod Autoscaler (preview).", - "description": "Rightsize your pods and dynamically set requests and limits based on historic usage.", + "text": "(Cluster) Enable the cluster autoscaler to automatically reduce the number of agent nodes in response to excess resource capacity.", + "description": "Automatically scaling down the number of nodes in your AKS cluster lets you run an efficient cluster when demand is low and scale up when demand increases.", "type": "recommendation", - "guid": "091dc052-7bf8-4dd1-aca1-d514ddcb2aa8" + "guid": "d6e28fe3-d81c-4f1f-a66e-f33a76e2f386" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use Kubernetes Event Driven Autoscaling (KEDA).", - "description": "Scale based on the number of events being processed. 
Choose from a rich catalogue of 50+ KEDA scalers.", + "text": "(Cluster) Enable node autoprovisioning to automate VM SKU selection.", + "description": "Node autoprovision simplifies the SKU selection process and decides, based on pending pod resource requirements, the optimal VM configuration to run workloads in the most efficient and cost-effective manner.", "type": "recommendation", - "guid": "fe3d7c5f-f5a9-4b5e-a4f9-81bf76930967" + "guid": "97d8601a-8380-427f-9e32-745314d927a0" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Adopt a cloud financial discipline and cultural practice to drive ownership of cloud usage.", - "description": "The foundation of enabling cost optimization is the spread of a cost saving cluster. A financial operations approach (FinOps) is often used to help organizations reduce cloud costs. It is a practice involving collaboration between finance, operations, and engineering teams to drive alignment on cost saving goals and bring transparency to cloud costs.", + "text": "(Workload) Use HorizontalPodAutoscaler to adjust the number of pods in a deployment depending on CPU utilization or other metrics.", + "description": "Automatically scaling down the number of pods when demand is low and scaling out when demand increases results in a more cost-effective operation of your workload.", "type": "recommendation", - "guid": "3a7d1aa1-37a0-40aa-acbf-ff1852c15c93" + "guid": "48f8f32e-0729-4d60-935d-eb3ee5ed803c" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Sign up for Azure Reservations or Azure Savings Plan.", - "description": "If you properly planned for capacity, your workload is predictable and exists for an extended period of time, sign up for an Azure Reservation or a savings plan to further reduce your resource costs.", + "text": "(Workload) Use VerticalPodAutoscaler (preview) to rightsize your pods and dynamically set requests and limits 
based on historic usage.", + "description": "By setting resource requests and limits on containers for each workload, VerticalPodAutoscaler frees up CPU and memory for other pods and helps ensure effective utilization of your AKS clusters.", "type": "recommendation", - "guid": "ed669535-43d7-4988-9ec5-3b70762e54eb" + "guid": "d5f11c9f-a31e-45c4-a6d5-e0796cecb8db" }, { "waf": "Cost", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Configure the AKS Cost Analysis add-on.", - "description": "The cost analysis cluster extension enables you to obtain granular insight into costs associated with various Kubernetes resources in your clusters or namespaces.", + "text": "(Cluster) Configure the AKS cost analysis add-on.", + "description": "The cost analysis cluster extension enables you to obtain granular insight into costs that are associated with various Kubernetes resources in your clusters or namespaces.", "type": "recommendation", - "guid": "6588bddb-d7b6-41a6-8b87-628a758df2fe" - }, - { - "waf": "operations", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use a template-based deployment using Bicep, Terraform, or others. Make sure that all deployments are repeatable, traceable, and stored in a source code repo.", - "description": "", - "type": "checklist", - "guid": "38e28f4f-7465-4e3a-bf05-4d6072d4bae6" - }, - { - "waf": "operations", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Build an automated process to ensure your clusters are bootstrapped with the necessary cluster-wide configurations and deployments. 
This is often performed using GitOps.", - "description": "", - "type": "checklist", - "guid": "1255b75f-0f3b-4fd7-96c5-17e4e30bbe11" + "guid": "f88744d4-fdf0-4081-b7cd-f53a87dabf69" }, { "waf": "operations", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use a repeatable and automated deployment processes for your workload within your software development lifecycle.", + "text": "(Cluster) Implement an infrastructure as code (IaC) deployment approach. Use a declarative, template-based deployment approach by using Bicep, Terraform, or similar tools. Make sure that all deployments are repeatable, traceable, and stored in a source code repo. For more information, see the quickstarts in the AKS product documentation.", "description": "", "type": "checklist", - "guid": "07b2d10f-e373-484e-9ece-f182a1b714b2" + "guid": "b8e44f32-13bd-4be9-ab25-806f1b0a566d" }, { "waf": "operations", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Enable diagnostics settings to ensure control plane or core API server interactions are logged.", + "text": "(Cluster and workload) Automate infrastructure and workload deployments. Use standard software solutions to manage, integrate, and automate the deployment of your cluster and workloads. Integrate deployment pipelines with your source control system and incorporate automated tests.", "description": "", "type": "checklist", - "guid": "6ec1af22-6132-43a6-9286-20333ee6e244" + "guid": "d66e2aae-f5a2-48a8-a6c9-240845bee261" }, { "waf": "operations", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Review the Best practices for monitoring Kubernetes with Azure Monitor to determine the best monitoring strategy for your workloads.", + "text": "(Cluster and workload) Implement a comprehensive monitoring strategy. Collect logs and metrics to monitor the health of the workload, identify trends in performance and reliability, and troubleshoot problems. 
Review the Best practices for monitoring Kubernetes with Azure Monitor and the Well-Architected Recommendations for designing and creating a monitoring system to determine the best monitoring strategy for your workloads.", "description": "", "type": "checklist", - "guid": "98844a97-9372-4cd2-ad68-aae36a43de4c" + "guid": "bf00c2be-4607-4c5f-849b-0382a633dd3a" }, { "waf": "operations", "service": "Azure Kubernetes Service", - "text": "Workload architecture: The workload should be designed to emit telemetry that can be collected, which should also include liveliness and readiness statuses.", + "text": "(Cluster and workload) Implement testing in production strategies. Testing in production uses real deployments to validate and measure an application's behavior and performance in the production environment. Use chaos engineering practices that target Kubernetes to identify application or platform reliability issues.", "description": "", "type": "checklist", - "guid": "20301c55-a815-4a3a-83ec-fe6d1789e697" + "guid": "2d675047-78a8-4893-af79-3cac3833fe92" }, { "waf": "operations", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Use chaos engineering practices that target Kubernetes to identify application or platform reliability issues.", + "text": "(Cluster and workload) Enforce workload governance. Azure Policy helps ensure consistent compliance with organizational standards, automates policy enforcement, and provides centralized visibility and control over your cluster resources.", "description": "", "type": "checklist", - "guid": "13d37985-5266-4064-97f0-7c7aa7491941" + "guid": "59378dbd-8213-4eb8-96f6-1e81e5f45f82" }, { "waf": "operations", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Optimize your workload to operate and deploy efficiently in a container.", + "text": "(Cluster and workload) Use stamp-level, blue-green deployments for mission-critical workloads. 
A stamp-level, blue-green deployment approach can increase confidence in releasing changes and enables zero-downtime upgrades because compatibilities with downstream dependencies like the Azure platform, resource providers, and IaC modules can be validated.", "description": "", "type": "checklist", - "guid": "d96fea7e-598f-4f59-95c6-d75fc8db1c7a" + "guid": "43dffe58-2663-47da-8e40-a59f50ce282a" }, { "waf": "operations", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Enforce cluster and workload governance using Azure Policy.", + "text": "(Cluster and workload) Make workloads more sustainable. Making workloads more sustainable and cloud efficient requires combining efforts around _cost optimization_, _reducing carbon emissions_, and _optimizing energy consumption_. Optimizing the application's cost is the initial step in making workloads more sustainable.", "description": "", "type": "checklist", - "guid": "8e8fbde0-f037-4eec-bac2-634ab73d7b0a" + "guid": "205641b3-d91b-44b5-8cec-0602f958a754" }, { "waf": "Operations", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Review AKS best practices documentation.", - "description": "To build and run applications successfully in AKS, there are key considerations to understand and implement. These areas include multi-tenancy and scheduler features, cluster, and pod security, or business continuity and disaster recovery.", + "text": "(Cluster) Operationalize cluster and pod configuration standards by using Azure policies for AKS.", + "description": "Azure policies for AKS can help you apply at-scale enforcement and safeguards on your clusters in a centralized, consistent manner. 
Use policies to define the permissions granted to pods and ensure compliance with company policies.", "type": "recommendation", - "guid": "95ee25c6-37fc-47c0-a3e1-eea5c1324edb" + "guid": "43cc541a-fa59-4647-93f9-56385d6d51bb" }, { "waf": "Operations", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Review Azure Chaos Studio.", - "description": "Azure Chaos Studio can help simulate faults and trigger disaster recovery situations.", + "text": "(Workload) Use Kubernetes Event Driven Autoscaler (KEDA).", + "description": "KEDA allows your applications to scale based on events, like the number of events being processed. You can choose from a rich catalog of more than 50 KEDA scalers.", "type": "recommendation", - "guid": "2695b4b9-e125-4644-a27c-977eefdbce73" - }, - { - "waf": "Operations", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Adopt a multiregion strategy by deploying AKS clusters deployed across different Azure regions to maximize availability and provide business continuity.", - "description": "Internet facing workloads should leverage Azure Front Door or Azure Traffic Manager to route traffic globally across AKS clusters.", - "type": "recommendation", - "guid": "844d923f-cfe0-4a3a-97ff-67c072c4220c" - }, - { - "waf": "Operations", - "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Operationalize clusters and pods configuration standards with Azure Policy.", - "description": "Azure Policy can help to apply at-scale enforcement and safeguards on your clusters in a centralized, consistent manner. 
It can also control what functions pods are granted and if anything is running against company policy.", - "type": "recommendation", - "guid": "02f47a84-6d0c-4243-a5ab-743c85dcce67" - }, - { - "waf": "Operations", - "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use platform capabilities in your release engineering process.", - "description": "Kubernetes and ingress controllers support many advanced deployment patterns for inclusion in your release engineering process. Consider patterns like blue-green deployments or canary releases.", - "type": "recommendation", - "guid": "ecbf4266-e97a-4b02-8d02-ca2fd42cea5b" - }, - { - "waf": "Operations", - "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: For mission-critical workloads, use stamp-level blue/green deployments.", - "description": "Automate your mission-critical design areas, including deployment and testing.", - "type": "recommendation", - "guid": "4646ab36-8c28-4740-afac-c9819f0f6ac9" - }, - { - "waf": "performance", - "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Perform and iterate on a detailed capacity plan exercise that includes SKU, autoscale settings, IP addressing, and failover considerations.", - "description": "", - "type": "checklist", - "guid": "de0e6d8d-65a5-46b0-891b-2ad2aa09de11" + "guid": "484d77b3-8337-49d2-b2f2-95691f5f38fd" }, { "waf": "performance", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Enable cluster autoscaler to automatically adjust the number of agent nodes in response workload demands.", + "text": "(Cluster and workload) Conduct capacity planning. 
Perform and iterate on a detailed capacity plan exercise that includes SKU, autoscale settings, IP addressing, and failover considerations.", "description": "", "type": "checklist", - "guid": "b78aea0e-3fd2-4dcc-b617-34a26892aa76" + "guid": "dc6ceff0-8c7f-4952-9cd8-6e572efc8f27" }, { "waf": "performance", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Use the Horizontal pod autoscaler to adjust the number of pods in a deployment depending on CPU utilization or other select metrics.", + "text": "(Cluster) Define a scaling strategy. Configure scaling to ensure that resources are adjusted efficiently to meet workload demands without overuse or waste. Use AKS features like cluster autoscaling and HorizontalPodAutoscaler to dynamically meet your workload needs with less strain on operations. Optimize your workload to operate and deploy efficiently in a container.", "description": "", "type": "checklist", - "guid": "9b168db7-a88b-4e2c-b555-bc525c7e48da" + "guid": "87795553-aec3-46e6-8af7-473568da51b7" }, { "waf": "performance", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Perform ongoing load testing activities that exercise both the pod and cluster autoscaler.", + "text": "(Cluster and workload) Conduct performance testing. Perform ongoing load testing activities that exercise both the pod and cluster autoscaler. Compare results against the performance targets and the established baselines.", "description": "", "type": "checklist", - "guid": "deadeedd-dab6-4774-84df-f5223ed6ede1" + "guid": "fba97657-e470-42cc-b444-389197ca23a7" }, { "waf": "performance", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Separate workloads into different node pools allowing independent scalling.", + "text": "(Cluster and workload) Scale workloads and flows independently. Separate workloads and flows into different node pools to allow independent scaling. 
Follow the guidance in Optimize workload design using flows to identify and prioritize your flows.", "description": "", "type": "checklist", - "guid": "7b1e9a65-2d5d-4eeb-a349-136895131b06" + "guid": "4156dc31-cb3f-4a39-b22a-544c0fa50e6d" }, { "waf": "Performance", "service": "Azure Kubernetes Service", - "text": "Cluster and workload architectures: Develop a detailed capacity plan and continually review and revise.", - "description": "After formalizing your capacity plan, it should be frequently updated by continuously observing the resource utilization of the cluster.", + "text": "(Cluster) Enable cluster autoscaler to automatically adjust the number of agent nodes in response to workload demands. Use the HorizontalPodAutoscaler to adjust the number of pods in a deployment depending on CPU utilization or other metrics.", + "description": "The ability to automatically scale up or scale down the number of nodes and the number of pods in your AKS cluster lets you run an efficient, cost-effective cluster.", "type": "recommendation", - "guid": "7b4e5e8b-6e55-4db7-a330-19990bd1e0d2" + "guid": "4592b399-dc74-427c-9a82-8fcdbbf709b2" }, { "waf": "Performance", "service": "Azure Kubernetes Service", - "text": "Cluster architecture: Enable cluster autoscaler to automatically adjust the number of agent nodes in response to resource constraints.", - "description": "The ability to automatically scale up or down the number of nodes in your AKS cluster lets you run an efficient, cost-effective cluster.", + "text": "(Cluster and workload) Separate workloads into different node pools and consider scaling user node pools.", + "description": "Unlike system node pools that always require running nodes, user node pools allow you to scale up or scale down.", "type": "recommendation", - "guid": "631272b4-9ca7-4e6f-ae9b-3c55946b3924" + "guid": "e64abe7f-f274-44cf-ad09-ced79f2478d8" }, { "waf": "Performance", "service": "Azure Kubernetes Service", - "text": "Cluster and workload
architectures: Separate workloads into different node pools and consider scaling user node pools.", - "description": "Unlike System node pools that always require running nodes, user node pools allow you to scale up or down.", + "text": "(Workload) Use AKS advanced scheduler features to implement advanced balancing of resources for workloads that require them.", + "description": "As you manage AKS clusters, you often need to isolate teams and workloads. Advanced features that the Kubernetes scheduler provides let you control which pods can be scheduled on certain nodes. They also let you control how multipod applications can be appropriately distributed across the cluster.", "type": "recommendation", - "guid": "85ffe644-7c4b-4f2d-b2fa-ed8e7c8d2e84" + "guid": "029bd63d-ae7f-4caf-a3db-dcb60dad5f55" }, { "waf": "Performance", "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use AKS advanced scheduler features.", - "description": "Helps control balancing of resources for workloads that require them.", + "text": "(Workload) Use KEDA to build a meaningful autoscale ruleset based on signals that are specific to your workload.", + "description": "Not all scale decisions can be derived from CPU or memory metrics. Scale considerations often come from more complex or even external data points. KEDA allows your applications to scale based on events, such as the number of messages in a queue or the length of a topic lag.", "type": "recommendation", - "guid": "bb7f1396-6a84-4e6d-b6d8-5fc48b4c2b5f" - }, - { - "waf": "Performance", - "service": "Azure Kubernetes Service", - "text": "Workload architecture: Use meaningful workload scaling metrics.", - "description": "Not all scale decisions can be derived from CPU or memory metrics. Often scale considerations will come from more complex or even external data points. 
Use KEDA to build a meaningful auto scale ruleset based on signals that are specific to your workload.", - "type": "recommendation", - "guid": "bc908693-9e21-478c-9fde-e92ba949c89e" + "guid": "67faf938-28a1-47e6-aa80-96f98bae7ab9" }, { "waf": "reliability", @@ -4644,10 +4788,10 @@ { "waf": "reliability", "service": "Virtual Machines", - "text": "Calculate your composite service-level objectives (SLOs) based on Azure service-level agreements (SLAs). Ensure that your SLO isn't higher than the Azure SLAs to avoid unrealistic expectations and potential issues.", + "text": "Calculate your composite service-level objectives (SLOs) based on Azure service-level agreements (SLAs). Ensure that your SLO isn't higher than the Azure SLAs to avoid unrealistic expectations and potential problems.", "description": "", "type": "checklist", - "guid": "4cd18dd0-57e4-4fc9-a8ad-74c3333b6d8b" + "guid": "6dd7f7f5-fd57-48e3-bc91-e390984e8649" }, { "waf": "reliability", @@ -4660,18 +4804,18 @@ { "waf": "reliability", "service": "Virtual Machines", - "text": "Make VMs and their dependencies redundant across zones. If a VM fails, the workload should continue to function because of redundancy. Include dependencies in your redundancy choices. For example, use the built-in redundancy options that are available with disks. Use zone-redundant IPs to ensure data availability and high uptime.", + "text": "Make VMs and their dependencies redundant across zones. If a VM fails, the workload should continue to function because of redundancy. Include dependencies in your redundancy choices. For example, use the built-in redundancy options that are available with disks. 
Use zone-redundant IP addresses to ensure data availability and high uptime.", "description": "", "type": "checklist", - "guid": "6d76e7b9-52d3-4878-83ab-2a08df23c8f6" + "guid": "476ffc3e-5127-480c-9d10-11f1a946f33f" }, { "waf": "reliability", "service": "Virtual Machines", - "text": "Be ready to scale up and scale out to prevent service level degradation and to avoid failures. Virtual Machine Scale Sets have autoscale capabilities that create new instances as required and distribute the load across multiple VMs and availability zones.", + "text": "Be ready to scale up and scale out to prevent service-level degradation and to avoid failures. Virtual Machine Scale Sets have autoscale capabilities that create new instances as required and distribute the load across multiple VMs and availability zones.", "description": "", "type": "checklist", - "guid": "56d104f5-469e-4ead-bf17-7c81c746dabe" + "guid": "cecd7835-7c60-410d-99af-6d54ba24fa9e" }, { "waf": "reliability", @@ -4684,10 +4828,10 @@ { "waf": "reliability", "service": "Virtual Machines", - "text": "Rightsize the VMs and their dependencies. Understand your VM's expected work to ensure it's not undersized and can handle the maximum load. Have extra capacity to mitigate failures.", + "text": "Rightsize the VMs and their dependencies. Understand your VM's expected work to ensure that it's not undersized and can handle the maximum load. Have extra capacity to mitigate failures.", "description": "", "type": "checklist", - "guid": "3d633098-6fc7-4752-907c-5a02d53ffba2" + "guid": "d968c1b3-872f-4ae7-ab11-70bd46ad0d99" }, { "waf": "reliability", @@ -4717,7 +4861,7 @@ "waf": "Reliability", "service": "Virtual Machines", "text": "(VMs) Implement heath endpoints that emit instance health statuses on VMs. (Scale set) Enable automatic repairs on the scale set by specifying the preferred repair action. 
Consider setting a time frame during which automatic repairs pause if the VM's state changes.", - "description": "Maintain availability even if an instance is deemed unhealthy. Automatic repairs initiate recovery by replacing the faulty instance. Setting a time window can prevent inadvertent or premature repair operations.", + "description": "Maintain availability even if an instance is deemed unhealthy. Automatic repairs initiate recovery by replacing the faulty instance. Setting a time window can prevent inadvertent or premature repair operations.", "type": "recommendation", "guid": "29912435-4c36-465e-91e2-6c8fcb42b33f" }, @@ -4729,6 +4873,14 @@ "type": "recommendation", "guid": "71982e91-2b16-4f49-b91e-7c52a6e0042a" }, + { + "waf": "Reliability", + "service": "Virtual Machines", + "text": "(Scale set) Preallocate instances with standby pools.", + "description": "Standby pool instances remain dormant but are ready to take over workloads if a failure occurs. This capability enhances the system's reliability.", + "type": "recommendation", + "guid": "1916c59b-ed12-496b-8afa-2822459b26b1" + }, { "waf": "Reliability", "service": "Virtual Machines", @@ -4740,10 +4892,18 @@ { "waf": "Reliability", "service": "Virtual Machines", - "text": "(Scale set) Deploy across availability zones on scale sets. Set up at least two instances in each zone. Zone balancing equally spreads the instances across zones.", + "text": "(Scale set) Deploy across availability zones on scale sets. Set up at least two instances in each zone. Zone balancing equally spreads the instances across zones.", "description": "The VM instances are provisioned in physically separate locations within each Azure region that are tolerant to local failures. Keep in mind that, depending on resource availability, there might be an uneven number of instances across zones. Zone balancing supports availability by making sure that, if one zone is down, the other zones have sufficient instances. 
Two instances in each zone provide a buffer during upgrades.", "type": "recommendation", - "guid": "056b1e06-a521-4662-9c87-19371260efe6" + "guid": "922cdc8f-c776-4888-b114-1ae54a5a212d" + }, + { + "waf": "Reliability", + "service": "Virtual Machines", + "text": "(Scale set) To enhance service uptime while maintaining control over the cost implications of upgrades, enable MaxSurge.", + "description": "New instances are created in batches by using the latest scale model. After the new instances are healthy, the old instances are deleted in batches. This process continues until all instances are updated, which ensures no downtime during updates.", + "type": "recommendation", + "guid": "67718ebb-6a3d-4f1f-aab3-878bea4864e3" }, { "waf": "Reliability", @@ -4772,10 +4932,10 @@ { "waf": "security", "service": "Virtual Machines", - "text": "Identify the VMs that hold state. Make sure that data is classified according to the sensitivity labels that your organization provided. Protect data by using security controls like appropriate levels of at-rest and in-transit encryption. If you have high sensitivity requirements, consider using high-security controls like double encryption and Azure confidential computing to protect data-in-use.", + "text": "Identify the VMs that hold state. Make sure that data is classified according to the sensitivity labels that your organization provides. Protect data by using security controls like appropriate levels of at-rest and in-transit encryption. If you have high sensitivity requirements, consider using high-security controls like double encryption and Azure confidential computing to protect data-in-use.", "description": "", "type": "checklist", - "guid": "5bc85e31-f7fa-4102-8e43-6fbe7add0d89" + "guid": "465d82f1-8352-43c7-bdf9-245f7cb3c712" }, { "waf": "security", @@ -4828,10 +4988,10 @@ { "waf": "security", "service": "Virtual Machines", - "text": "Threat prevention. 
Protect against malware attacks and malicious actors by implementing security controls like firewalls, antivirus software, and intrusion detection systems. Determine if a Trusted Execution Environment (TEE) is required.", + "text": "Threat prevention. Protect against malware attacks and malicious actors by implementing security controls like firewalls, antivirus software, and intrusion detection systems. Determine whether a Trusted Execution Environment (TEE) is required.", "description": "", "type": "checklist", - "guid": "fb9921a8-884c-40df-a588-a554143809f2" + "guid": "6f217e25-03fc-46ab-8a40-7fbf4af9aaba" }, { "waf": "Security", @@ -4844,10 +5004,10 @@ { "waf": "Security", "service": "Virtual Machines", - "text": "(Scale set) Choose VM SKUs with security features. For example, some SKUs support BitLocker encryption, and confidential computing provides encryption of data-in-use. Review the features to understand the limitations.", + "text": "(Scale set) Choose VM SKUs that have security features. For example, some SKUs support BitLocker encryption, and confidential computing provides encryption of data-in-use. Review the features to understand the limitations.", "description": "Azure-provided features are based on signals that are captured across many tenants and can protect resources better than custom controls. You can also use policies to enforce those controls.", "type": "recommendation", - "guid": "f13132ec-ac80-4a27-a14e-b7b357523df1" + "guid": "2b1a3c8f-e985-4412-881e-5512a3585954" }, { "waf": "Security", @@ -4869,7 +5029,7 @@ "waf": "Security", "service": "Virtual Machines", "text": "(VMs) Choose secure networking options for your VM's network profile. Don't directly associate public IP addresses to your VMs and don't enable IP forwarding. Ensure that all virtual network interfaces have an associated network security group.", - "description": "You can set segmentation controls in the networking profile. 
Attackers scan public IP addresses, which makes VMs vulnerable to threats.", + "description": "You can set segmentation controls in the networking profile. Attackers scan public IP addresses. This activity makes VMs vulnerable to threats.", "type": "recommendation", "guid": "3c83a4bb-2b58-4fae-9a65-37490fecaf1c" }, @@ -4956,26 +5116,34 @@ { "waf": "Cost", "service": "Virtual Machines", - "text": "(VMs, scale set) Evaluate the disk options that are associated with your VM's SKUs. Determine your performance needs while keeping in mind your storage capacity needs and accounting for fluctuating workload patterns. For example, the Azure Premium SSD v2 disk allows you to granularly adjust your performance independent of the disk's size.", - "description": "Some high-performance disk types offer extra cost optimization features and strategies. The Premium SSD v2 disk's adjustment capability can reduce costs because it provides high performance without overprovisioning, which could otherwise lead to underutilized resources.", + "text": "(Scale set) Mix regular VMs with spot virtual machines. Flexible orchestration lets you distribute spot virtual machines based on a specified percentage.", + "description": "Reduce compute infrastructure costs by applying the deep discounts of spot virtual machines.", "type": "recommendation", - "guid": "14211a5a-ea48-4aa1-9f24-e46689328dda" + "guid": "38d3a88b-a84f-46f3-95ce-76a4e245027b" }, { "waf": "Cost", "service": "Virtual Machines", - "text": "(Scale set) Mix regular VMs with spot virtual machines. Flexible orchestration lets you distribute spot virtual machines based on a specified percentage.", - "description": "Reduce compute infrastructure costs by applying the deep discounts of spot virtual machines.", + "text": "(Scale set) Reduce the number of VM instances when demand decreases. 
Set a scale-in policy based on criteria.", + "description": "Scaling in resources when they're not in use reduces the number of VMs that run in the scale set, which saves costs.", "type": "recommendation", - "guid": "38d3a88b-a84f-46f3-95ce-76a4e245027b" + "guid": "dfdebf88-1b90-4593-8c47-0b1d9363718f" }, { "waf": "Cost", "service": "Virtual Machines", - "text": "(Scale set) Reduce the number of VM instances when demand decreases. Set a scale-in policy based on criteria. Stop VMs during off-hours. You can use the Azure Automation Start/Stop feature and configure it according to your business needs.", - "description": "Scaling in or stopping resources when they're not in use reduces the number of VMs running in the scale set, which saves costs. The Start/Stop feature is a low-cost automation option.", + "text": "(VMs) Stop VMs during off-hours. You can use the Azure Automation Start/Stop feature and configure it according to your business needs.", + "description": "The Start/Stop feature is a low-cost automation option that can significantly affect your idle instance costs.", "type": "recommendation", - "guid": "eccee9b1-70f3-4c96-8223-35e24d842201" + "guid": "7bd23403-0764-4f91-9976-e06cdcbb86df" + }, + { + "waf": "Cost", + "service": "Virtual Machines", + "text": "(VMs) Free up CPU resources by using Azure Boost.", + "description": "Offloading back-end virtualization processes frees up CPU resources for the guest virtual machines. This optimization results in improved performance. Azure Boost is only available on specific VMs, so ensure that you also choose VM sizes that have Azure Boost enabled.", + "type": "recommendation", + "guid": "6c74b2fb-96c5-49c7-98e2-8dbe8ca75610" }, { "waf": "Cost", @@ -4988,10 +5156,10 @@ { "waf": "operations", "service": "Virtual Machines", - "text": "Monitor the VM instances. Collect logs and metrics from VM instances to monitor resource usage and measure the health of the instances. 
Some common metrics include CPU usage, number of requests, and input/output (I/O) latency. Set up Azure Monitor alerts to be notified about issues and to detect configuration changes in your environment.", + "text": "Monitor the VM instances. Collect logs and metrics from VM instances to monitor resource usage and measure the health of the instances. Some common metrics include CPU usage, number of requests, and input/output (I/O) latency. Set up Azure Monitor alerts to be notified about problems and to detect configuration changes in your environment.", "description": "", "type": "checklist", - "guid": "fc1734b9-fefe-403d-bfc9-c2beca6f12a2" + "guid": "c599601d-aadc-4ac2-85ad-21ef8a5a2ce5" }, { "waf": "operations", @@ -5020,10 +5188,10 @@ { "waf": "operations", "service": "Virtual Machines", - "text": "Have processes for installing automatic updates. Consider using Automatic VM guest patching for a timely rollout of critical patches and security patches. Use Azure Update Manager to manage OS updates for your Windows and Linux virtual machines in Azure.", + "text": "Have processes for installing automatic updates. Consider using Automatic VM guest patching for a timely rollout of critical patches and security patches. Use Azure Update Manager to manage OS updates for your Windows and Linux VMs in Azure.", "description": "", "type": "checklist", - "guid": "821779c7-588f-40af-a740-8dd41fb585fd" + "guid": "a558ac6a-dec1-4db0-b93f-4b4fc4fa09c9" }, { "waf": "operations", @@ -5053,7 +5221,7 @@ "waf": "Operations", "service": "Virtual Machines", "text": "(Scale set) Keep your VMs up to date by setting an upgrade policy. We recommend rolling upgrades. However, if you need granular control, choose to upgrade manually. For Flexible orchestration, you can use Azure Update Manager.", - "description": "Security is the primary reason for upgrades. Security assurances for the instances shouldn't decay over time. 
Rolling upgrades are done in batches, which ensures all instances aren't down at the same time.", + "description": "Security is the primary reason for upgrades. Security assurances for the instances shouldn't decay over time. Rolling upgrades are done in batches. This approach ensures that all instances aren't down at the same time.", "type": "recommendation", "guid": "19de83a6-2f87-49b1-8241-d1d687f46658" }, @@ -5068,18 +5236,18 @@ { "waf": "Operations", "service": "Virtual Machines", - "text": "Install prebuilt software components as extensions as part of bootstrapping. Azure supports many extensions that can be used to configure, monitor, secure, and provide utility applications for your VMs. Enable automatic upgrades on extensions.", + "text": "Install prebuilt software components as extensions as part of bootstrapping. Azure supports many extensions that can be used to configure, monitor, secure, and provide utility applications for your VMs. Enable automatic upgrades on extensions.", "description": "Extensions can help simplify the software installation at scale without you having to manually install, configure, or upgrade it on each VM.", "type": "recommendation", - "guid": "40266c61-9685-4ee1-8cb6-a899a6b573f2" + "guid": "44bd8d77-3128-469d-a39b-315426e8f5ad" }, { "waf": "Operations", "service": "Virtual Machines", - "text": "(VMs, scale set) Monitor and measure the health of the VM instances. Deploy the Monitor agent extension to your VMs to collect monitoring data from the guest OS with OS-specific data collection rules. Enable VM insights to monitor health and performance and to view trends from the collected data. Use boot diagnostics to get information as VMs boot. Boot diagnostics also diagnose boot failures.", - "description": "Monitoring data is at the core of incident resolution. A comprehensive monitoring stack provides information about how the VMs are performing and their health. 
By continuously monitoring the instances, you can be ready for or prevent failures like performance overload and reliability issues.", + "text": "(VMs, scale set) Monitor and measure the health of the VM instances. Deploy the Monitor agent extension to your VMs to collect monitoring data from the guest OS with OS-specific data collection rules. Enable VM insights to monitor health and performance and to view trends from the collected data. Use boot diagnostics to get information as VMs boot. Boot diagnostics also diagnose boot failures.", + "description": "Monitoring data is at the core of incident resolution. A comprehensive monitoring stack provides information about how the VMs are performing and their health. By continuously monitoring the instances, you can be ready for or prevent failures like performance overload and reliability problems.", "type": "recommendation", - "guid": "72e210dc-5abb-4569-b650-ec27003bb1cf" + "guid": "449d3700-b50e-4525-919b-47a5f2757393" }, { "waf": "performance", @@ -5132,10 +5300,10 @@ { "waf": "Performance", "service": "Virtual Machines", - "text": "(VMs, scale set) Choose SKUs for VMs that align with your capacity planning. Have a good understanding of your workload requirements, including the number of cores, memory, storage, and network bandwidth so that you can filter out unsuitable SKUs.", - "description": "Rightsizing your VMs is a fundamental decision that significantly affects the performance of your workload. Without the right set of VMs, you might experience performance issues and accrue unnecessary costs.", + "text": "(VMs, scale set) Choose SKUs for VMs that align with your capacity planning. Have a good understanding of your workload requirements, including the number of cores, memory, storage, and network bandwidth so that you can filter out unsuitable SKUs.", + "description": "Rightsizing your VMs is a fundamental decision that significantly affects the performance of your workload. 
Without the right set of VMs, you might experience performance problems and accrue unnecessary costs.", "type": "recommendation", - "guid": "67cc04db-4b74-40f4-9bf6-0ba5f758b45e" + "guid": "63026105-897c-416f-bc8e-f60c23b64974" }, { "waf": "Performance", @@ -5145,14 +5313,6 @@ "type": "recommendation", "guid": "ef3de2ad-d5b6-4931-9fc7-28249d9300c0" }, - { - "waf": "Performance", - "service": "Virtual Machines", - "text": "(VMs, scale set) Set the storage profile by analyzing the disk performance of existing workloads and the VM SKU. Use Premium SSDs for production VMs. Adjust the performance of disks with Premium SSD v2. Use locally attached NVMe devices.", - "description": "Premium SSDs deliver high-performance and low-latency disk support VMs with I/O-intensive workloads. Premium SSD v2 doesn't require disk resizing, which enables high performance without excessive over-provisioning and minimizes the cost of unused capacity. When available on VM SKUs, locally attached NVMe or similar devices can offer high performance, especially for use cases that require high input/output operations per second (IOPS) and low latency.", - "type": "recommendation", - "guid": "a91aae92-2b2c-4c88-b268-2c70b995a952" - }, { "waf": "Performance", "service": "Virtual Machines", @@ -5173,34 +5333,34 @@ "categories": [], "waf": [ { - "name": "reliability" + "name": "Security" }, { - "name": "Cost" + "name": "Reliability" }, { - "name": "Operations" + "name": "security" }, { - "name": "security" + "name": "reliability" }, { - "name": "Performance" + "name": "operations" }, { - "name": "Reliability" + "name": "Operations" }, { - "name": "operations" + "name": "Performance" }, { - "name": "Security" + "name": "cost" }, { - "name": "performance" + "name": "Cost" }, { - "name": "cost" + "name": "performance" } ], "yesno": [ @@ -5237,6 +5397,6 @@ "name": "WAF Service Guides", "waf": "all", "state": "preview", - "timestamp": "October 20, 2024" + "timestamp": "May 04, 2025" } } \ No newline at 
end of file