third-party/ad-block/protocol.cc

0001 /*
0002     SPDX-License-Identifier: MPL-2.0
0003 */
0004
0005 /* Copyright (c) 2015 Brian R. Bondy. Distributed under the MPL2 license.
0006  * This Source Code Form is subject to the terms of the Mozilla Public
0007  * License, v. 2.0. If a copy of the MPL was not distributed with this
0008  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
0009
0010 #include <ctype.h>
0011 #include "./protocol.h"
0012
/**
 * Reports whether the first `literalLen` bytes of `text` equal `literal`,
 * ignoring case, without ever reading beyond `textLen` bytes of `text`.
 *
 * `literal` must be given in lower case.  Bytes are compared one at a time
 * and the scan stops at the first mismatch, so a NUL terminator inside
 * `text` also ends the scan (no literal used here contains a NUL).
 */
static bool hasPrefixIgnoreCase(const char *text, int textLen,
                                const char *literal, int literalLen) {
  if (textLen < literalLen) {
    return false;
  }
  for (int i = 0; i < literalLen; i++) {
    // <ctype.h> functions require a value representable as unsigned char;
    // passing a negative plain char is undefined behavior.
    if (tolower((unsigned char)text[i]) != literal[i]) {
      return false;
    }
  }
  return true;
}

/**
 * Checks to see if a URL is "blockable".
 *
 * Blockable URLs are ones that use one of the following protocols (any of
 * which can be prefixed by "blob:"), matched case-insensitively and
 * followed by "://":
 *  - http
 *  - https
 *  - ws
 *  - wss
 *
 * @param url     Start of the URL text.  It does not need to be
 *                NUL-terminated; only the first `urlLen` bytes are examined.
 * @param urlLen  Number of bytes available at `url`.
 * @return true if the URL starts with one of the schemes above, false
 *         otherwise (including when `url` is null or `urlLen` is 5 or less).
 */
bool isBlockableProtocol(const char *url, int urlLen) {
  // URLs of five bytes or fewer are rejected up front.
  if (!url || urlLen <= 5) {
    return false;
  }

  const char *cursor = url;
  int remaining = urlLen;

  // Optional "blob:" prefix (at most one).
  if (hasPrefixIgnoreCase(cursor, remaining, "blob:", 5)) {
    cursor += 5;
    remaining -= 5;
  }

  // Mandatory base scheme: "http" or "ws".
  if (hasPrefixIgnoreCase(cursor, remaining, "http", 4)) {
    cursor += 4;
    remaining -= 4;
  } else if (hasPrefixIgnoreCase(cursor, remaining, "ws", 2)) {
    cursor += 2;
    remaining -= 2;
  } else {
    return false;
  }

  // Optional secure suffix, turning "http" into "https" or "ws" into "wss".
  if (remaining > 0 && tolower((unsigned char)*cursor) == 's') {
    cursor += 1;
    remaining -= 1;
  }

  // The scheme must be followed by "://", all of it inside the URL's bytes.
  return hasPrefixIgnoreCase(cursor, remaining, "://", 3);
}
0140