#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Detect English module
"""
https://www.nostarch.com/crackingcodes (BSD Licensed)

To use, type this code:
  >>> import detectEnglish
  >>> detectEnglish.isEnglish(someString) # returns True or False
(There must be a "dictionary.txt" file in this directory with all English
words in it, one word per line. You can download this from
https://invpy.com/dictionary.txt)
"""
from typing import Dict

UPPERLETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Characters kept by removeNonLetters(): A-Z, a-z, space, tab and newline.
LETTERS_AND_SPACE = UPPERLETTERS + UPPERLETTERS.lower() + " \t\n"


def loadDictionary() -> Dict[str, None]:
    """Load the word list from "dictionary.txt" in the working directory.

    Returns:
        A dict whose keys are the lines of the file (split on "\\n") and
        whose values are all None. Only the keys matter: the dict is used
        for fast membership tests.

    Raises:
        OSError: if "dictionary.txt" cannot be opened or read (for example
            FileNotFoundError when it does not exist).
    """
    # NOTE: a set would be the natural structure for pure membership tests
    # (a dict with dummy values is a hack); the dict is kept so the return
    # type stays the same for existing callers.
    # The context manager closes the file even if reading fails.
    with open("dictionary.txt") as dictionaryFile:
        return dict.fromkeys(dictionaryFile.read().split("\n"))


# Loaded once at import time, so importing this module requires
# "dictionary.txt" to be present.
ENGLISH_WORDS = loadDictionary()


def getEnglishCount(message: str) -> float:
    """Return the fraction of words in `message` found in ENGLISH_WORDS.

    The message is upper-cased, stripped of every character that is not in
    LETTERS_AND_SPACE, and split on whitespace; each resulting word is then
    looked up in ENGLISH_WORDS.

    Args:
        message: The text to examine.

    Returns:
        A float between 0.0 and 1.0, or 0.0 when the message contains no
        words at all.
    """
    # Lookups use the upper-cased text, so the dictionary entries are
    # presumably upper-case too -- verify against the dictionary file.
    possibleWords = removeNonLetters(message.upper()).split()
    if not possibleWords:
        return 0.0  # No words at all, so return 0.0.

    matches = sum(word in ENGLISH_WORDS for word in possibleWords)
    return matches / len(possibleWords)


def removeNonLetters(message: str) -> str:
    """Return `message` with only the LETTERS_AND_SPACE characters kept.

    Args:
        message: The text to filter.

    Returns:
        The characters of `message` that appear in LETTERS_AND_SPACE
        (letters, space, tab, newline), in their original order.
    """
    return "".join(symbol for symbol in message if symbol in LETTERS_AND_SPACE)


def isEnglish(
    message: str, wordPercentage: int = 20, letterPercentage: int = 85
) -> bool:
    """Decide whether `message` looks like English text.

    By default, 20% of the words must exist in the dictionary file, and
    85% of all the characters in the message must be letters or spaces
    (not punctuation or numbers).

    Args:
        message: The text to examine.
        wordPercentage: Minimum percentage of words that must be found in
            ENGLISH_WORDS.
        letterPercentage: Minimum percentage of the characters of `message`
            that must be in LETTERS_AND_SPACE.

    Returns:
        True when both thresholds are met, otherwise False. An empty
        message is never considered English.
    """
    if not message:
        # Nothing to measure; also avoids dividing by len(message) == 0.
        return False

    wordsMatch = getEnglishCount(message) * 100 >= wordPercentage
    numLetters = len(removeNonLetters(message))
    messageLettersPercentage = numLetters / len(message) * 100
    lettersMatch = messageLettersPercentage >= letterPercentage
    return wordsMatch and lettersMatch